From 3a5e0dedb94b9f83a3fd9331061fd927422d13f8 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 11:50:38 +0200 Subject: [PATCH 01/67] Initialization of dataset module --- backend/dataall/modules/datasets/__init__.py | 18 ++++++++++++++++++ .../dataall/modules/datasets/api/__init__.py | 1 + 2 files changed, 19 insertions(+) create mode 100644 backend/dataall/modules/datasets/__init__.py create mode 100644 backend/dataall/modules/datasets/api/__init__.py diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py new file mode 100644 index 000000000..67298a06e --- /dev/null +++ b/backend/dataall/modules/datasets/__init__.py @@ -0,0 +1,18 @@ +"""Contains the code related to datasets""" +import logging +from dataall.modules.loader import ModuleInterface, ImportMode + +log = logging.getLogger(__name__) + + +class DatasetApiModuleInterface(ModuleInterface): + """Implements ModuleInterface for the dataset GraphQL lambda""" + + @classmethod + def is_supported(cls, modes): + return ImportMode.API in modes + + def __init__(self): + import dataall.modules.datasets.api + log.info("API of datasets has been imported") + diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py new file mode 100644 index 000000000..13cf9331d --- /dev/null +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -0,0 +1 @@ +"""The GraphQL schema of datasets and related functionality""" From a50a02f39b05d64d1e6bc410afe956a66eeb5602 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 11:55:30 +0200 Subject: [PATCH 02/67] Refactoring of datasets Moved dataset table column to modules --- backend/dataall/api/Objects/DatasetTable/schema.py | 2 +- backend/dataall/api/Objects/__init__.py | 1 - .../datasets/api}/DatasetTableColumn/__init__.py | 2 +- .../datasets/api}/DatasetTableColumn/input_types.py | 2 +- .../datasets/api}/DatasetTableColumn/mutations.py | 7 +++++-- .../datasets/api}/DatasetTableColumn/queries.py | 4 ++-- .../datasets/api}/DatasetTableColumn/resolvers.py | 10 +++++----- .../datasets/api}/DatasetTableColumn/schema.py | 4 ++-- backend/dataall/modules/datasets/api/__init__.py | 5 +++++ 9 files changed, 22 insertions(+), 15 deletions(-) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/__init__.py (70%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/input_types.py (94%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/mutations.py (78%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/queries.py (74%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/resolvers.py (94%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/schema.py (93%) diff --git a/backend/dataall/api/Objects/DatasetTable/schema.py b/backend/dataall/api/Objects/DatasetTable/schema.py index dc1cffcb4..0436c0f2e 100644 --- a/backend/dataall/api/Objects/DatasetTable/schema.py +++ b/backend/dataall/api/Objects/DatasetTable/schema.py @@ -1,4 +1,4 @@ -from ..DatasetTableColumn.resolvers import list_table_columns +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns from ...
import gql from .resolvers import * from ...constants import GraphQLEnumMapper diff --git a/backend/dataall/api/Objects/__init__.py b/backend/dataall/api/Objects/__init__.py index 060f2ba6e..43d5e0833 100644 --- a/backend/dataall/api/Objects/__init__.py +++ b/backend/dataall/api/Objects/__init__.py @@ -18,7 +18,6 @@ Environment, Activity, DatasetTable, - DatasetTableColumn, Dataset, Group, Principal, diff --git a/backend/dataall/api/Objects/DatasetTableColumn/__init__.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py similarity index 70% rename from backend/dataall/api/Objects/DatasetTableColumn/__init__.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py index dfa46b264..691b10331 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/__init__.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py @@ -1,4 +1,4 @@ -from . import ( +from dataall.modules.datasets.api.DatasetTableColumn import ( input_types, mutations, queries, diff --git a/backend/dataall/api/Objects/DatasetTableColumn/input_types.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py similarity index 94% rename from backend/dataall/api/Objects/DatasetTableColumn/input_types.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py index 24fbbdbca..745e7f271 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/input_types.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py @@ -1,4 +1,4 @@ -from ... import gql +from dataall.api import gql DatasetTableColumnFilter = gql.InputType( name='DatasetTableColumnFilter', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/mutations.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py similarity index 78% rename from backend/dataall/api/Objects/DatasetTableColumn/mutations.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py index 012d83ea7..f7b682e3d 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/mutations.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py @@ -1,5 +1,8 @@ -from ... import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import ( + sync_table_columns, + update_table_column +) syncDatasetTableColumns = gql.MutationField( name='syncDatasetTableColumns', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/queries.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py similarity index 74% rename from backend/dataall/api/Objects/DatasetTableColumn/queries.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py index 4f5f05646..0a08e37b6 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/queries.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py @@ -1,5 +1,5 @@ -from ... 
import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns listDatasetTableColumns = gql.QueryField( name='listDatasetTableColumns', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/resolvers.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py similarity index 94% rename from backend/dataall/api/Objects/DatasetTableColumn/resolvers.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py index 88bf2c728..a7a1bf5f4 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/resolvers.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py @@ -1,10 +1,10 @@ from sqlalchemy import or_ -from .... import db -from ....api.context import Context -from ....aws.handlers.service_handlers import Worker -from ....db import paginate, permissions, models -from ....db.api import ResourcePolicy +from dataall import db +from dataall.api.context import Context +from dataall.aws.handlers.service_handlers import Worker +from dataall.db import paginate, permissions, models +from dataall.db.api import ResourcePolicy def list_table_columns( diff --git a/backend/dataall/api/Objects/DatasetTableColumn/schema.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py similarity index 93% rename from backend/dataall/api/Objects/DatasetTableColumn/schema.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py index d571fc9a6..8772e99b7 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/schema.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py @@ -1,5 +1,5 @@ -from ... import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import resolve_terms DatasetTableColumn = gql.ObjectType( diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 13cf9331d..f79e9a30c 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1 +1,6 @@ """The GraphQL schema of datasets and related functionality""" +from dataall.modules.datasets.api import ( + DatasetTableColumn +) + +__all__ = ["DatasetTableColumn"] From be1498689d48aa63aae3eba763635f8af800148b Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:09:05 +0200 Subject: [PATCH 03/67] Refactoring of datasets Renamed table column to the python's convention format --- backend/dataall/api/Objects/DatasetTable/schema.py | 2 +- backend/dataall/modules/datasets/api/__init__.py | 4 ++-- .../api/{DatasetTableColumn => table_column}/__init__.py | 2 +- .../api/{DatasetTableColumn => table_column}/input_types.py | 0 .../api/{DatasetTableColumn => table_column}/mutations.py | 2 +- .../api/{DatasetTableColumn => table_column}/queries.py | 2 +- .../api/{DatasetTableColumn => table_column}/resolvers.py | 0 .../api/{DatasetTableColumn => table_column}/schema.py | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/__init__.py (70%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/input_types.py (100%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/mutations.py (89%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/queries.py (80%) rename 
backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/resolvers.py (100%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/schema.py (95%) diff --git a/backend/dataall/api/Objects/DatasetTable/schema.py b/backend/dataall/api/Objects/DatasetTable/schema.py index 0436c0f2e..74d413818 100644 --- a/backend/dataall/api/Objects/DatasetTable/schema.py +++ b/backend/dataall/api/Objects/DatasetTable/schema.py @@ -1,4 +1,4 @@ -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns +from dataall.modules.datasets.api.table_column.resolvers import list_table_columns from ... import gql from .resolvers import * from ...constants import GraphQLEnumMapper diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index f79e9a30c..538df0734 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1,6 +1,6 @@ """The GraphQL schema of datasets and related functionality""" from dataall.modules.datasets.api import ( - DatasetTableColumn + table_column ) -__all__ = ["DatasetTableColumn"] +__all__ = ["table_column"] diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py b/backend/dataall/modules/datasets/api/table_column/__init__.py similarity index 70% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py rename to backend/dataall/modules/datasets/api/table_column/__init__.py index 691b10331..070301f58 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py +++ b/backend/dataall/modules/datasets/api/table_column/__init__.py @@ -1,4 +1,4 @@ -from dataall.modules.datasets.api.DatasetTableColumn import ( +from dataall.modules.datasets.api.table_column import ( input_types, mutations, queries, diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py b/backend/dataall/modules/datasets/api/table_column/input_types.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py rename to backend/dataall/modules/datasets/api/table_column/input_types.py diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py b/backend/dataall/modules/datasets/api/table_column/mutations.py similarity index 89% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py rename to backend/dataall/modules/datasets/api/table_column/mutations.py index f7b682e3d..0fc5a7d87 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py +++ b/backend/dataall/modules/datasets/api/table_column/mutations.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import ( +from dataall.modules.datasets.api.table_column.resolvers import ( sync_table_columns, update_table_column ) diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py b/backend/dataall/modules/datasets/api/table_column/queries.py similarity index 80% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py rename to backend/dataall/modules/datasets/api/table_column/queries.py index 0a08e37b6..2c29e94b7 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py +++ b/backend/dataall/modules/datasets/api/table_column/queries.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns 
+from dataall.modules.datasets.api.table_column.resolvers import list_table_columns listDatasetTableColumns = gql.QueryField( name='listDatasetTableColumns', diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py rename to backend/dataall/modules/datasets/api/table_column/resolvers.py diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py b/backend/dataall/modules/datasets/api/table_column/schema.py similarity index 95% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py rename to backend/dataall/modules/datasets/api/table_column/schema.py index 8772e99b7..9730b70b9 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py +++ b/backend/dataall/modules/datasets/api/table_column/schema.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import resolve_terms +from dataall.modules.datasets.api.table_column.resolvers import resolve_terms DatasetTableColumn = gql.ObjectType( From 06f82ad80386f53780af735571f3b7eb66497594 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:16:34 +0200 Subject: [PATCH 04/67] Refactoring of datasets Added dataset module to config.json --- config.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.json b/config.json index e0a9d85d0..4aed5ef87 100644 --- a/config.json +++ b/config.json @@ -2,6 +2,9 @@ "modules": { "notebooks": { "active": true + }, + "datasets": { + "active": true } } } \ No newline at end of file From 38145ae637a2084c1bb198a1fa76a395bd1c39b5 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:32:00 +0200 Subject: [PATCH 05/67] Fixed leftover in loader --- backend/dataall/modules/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/dataall/modules/loader.py b/backend/dataall/modules/loader.py index 95aa2083a..aa4a656d4 100644 --- a/backend/dataall/modules/loader.py +++ b/backend/dataall/modules/loader.py @@ -57,7 +57,7 @@ def load_modules(modes: List[ImportMode]) -> None: log.info(f"Module {name} is not active. 
Skipping...") continue - if active.lower() == "true" and not _import_module(name): + if not _import_module(name): raise ValueError(f"Couldn't find module {name} under modules directory") log.info(f"Module {name} is loaded") From f0e146aa2bf5cfb2156a369f9690adb9cfda9c09 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 13:56:04 +0200 Subject: [PATCH 06/67] Dataset refactoring Moved database table service --- .../api/Objects/DatasetProfiling/resolvers.py | 3 ++- .../api/Objects/DatasetTable/resolvers.py | 25 ++++++++++--------- backend/dataall/aws/handlers/glue.py | 5 ++-- backend/dataall/aws/handlers/redshift.py | 3 ++- backend/dataall/db/api/__init__.py | 1 - .../datasets/api/table_column/resolvers.py | 7 +++--- .../modules/datasets/services/__init__.py | 1 + .../datasets/services}/dataset_table.py | 9 +++---- .../subscriptions/subscription_service.py | 5 ++-- backend/dataall/tasks/tables_syncer.py | 3 ++- tests/api/test_dataset_table.py | 5 ++-- 11 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 backend/dataall/modules/datasets/services/__init__.py rename backend/dataall/{db/api => modules/datasets/services}/dataset_table.py (98%) diff --git a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py index 678a8cba6..c391a1a8c 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py +++ b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py @@ -6,6 +6,7 @@ from ....aws.handlers.sts import SessionHelper from ....db import api, permissions, models from ....db.api import ResourcePolicy +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) @@ -97,7 +98,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None): if run: if not run.results: - table = api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) dataset = api.Dataset.get_dataset_by_uri(session, table.datasetUri) environment = api.Environment.get_environment_by_uri( session, dataset.environmentUri diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 9ea811411..854b99de9 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,13 +13,14 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) def create_table(context, source, datasetUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.create_dataset_table( + table = DatasetTable.create_dataset_table( session=session, username=context.username, groups=context.groups, @@ -37,7 +38,7 @@ def list_dataset_tables(context, source, filter: dict = None): if not filter: filter = {} with context.engine.scoped_session() as session: - return db.api.DatasetTable.list_dataset_tables( + return DatasetTable.list_dataset_tables( session=session, username=context.username, groups=context.groups, @@ -49,8 +50,8 @@ def list_dataset_tables(context, source, filter: dict = None): def get_table(context, source: models.Dataset, tableUri: str = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.get_dataset_table_by_uri(session, 
tableUri) - return db.api.DatasetTable.get_dataset_table( + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + return DatasetTable.get_dataset_table( session=session, username=context.username, groups=context.groups, @@ -64,14 +65,14 @@ def get_table(context, source: models.Dataset, tableUri: str = None): def update_table(context, source, tableUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) input['table'] = table input['tableUri'] = table.tableUri - db.api.DatasetTable.update_dataset_table( + DatasetTable.update_dataset_table( session=session, username=context.username, groups=context.groups, @@ -85,8 +86,8 @@ def update_table(context, source, tableUri: str = None, input: dict = None): def delete_table(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) - db.api.DatasetTable.delete_dataset_table( + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + DatasetTable.delete_dataset_table( session=session, username=context.username, groups=context.groups, @@ -102,7 +103,7 @@ def delete_table(context, source, tableUri: str = None): def preview(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) @@ -157,7 +158,7 @@ def get_glue_table_properties(context: Context, source: models.DatasetTable, **k if not source: return None with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, source.tableUri ) return json_utils.to_string(table.GlueTableProperties).replace('\\', ' ') @@ -186,7 +187,7 @@ def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwar def publish_table_update(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -235,7 +236,7 @@ def resolve_redshift_copy_location( def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str, filter: dict = None): with context.engine.scoped_session() as session: - return db.api.DatasetTable.get_dataset_tables_shared_with_env( + return DatasetTable.get_dataset_tables_shared_with_env( session, envUri, datasetUri diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index ebeb1fca6..468268640 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,6 +6,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger('aws:glue') @@ -84,7 +85,7 @@ def list_tables(engine, task: models.Task): tables = Glue.list_glue_database_tables( accountid, dataset.GlueDatabaseName, region ) - db.api.DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) + DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) return tables @staticmethod @@ -642,7 +643,7 @@ def get_table_columns(engine, task: models.Task): f'//{dataset_table.name} due to: ' f'{e}' ) - db.api.DatasetTable.sync_table_columns( + DatasetTable.sync_table_columns( session, dataset_table, glue_table['Table'] ) return True diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index a6a02f9e7..a1f479417 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,6 +9,7 @@ from .sts import SessionHelper from ... import db from ...db import models +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) @@ -446,7 +447,7 @@ def copy_data(engine, task: models.Task): session, task.payload['datasetUri'] ) - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, task.payload['tableUri'] ) diff --git a/backend/dataall/db/api/__init__.py b/backend/dataall/db/api/__init__.py index 01647c81b..19138f7d7 100644 --- a/backend/dataall/db/api/__init__.py +++ b/backend/dataall/db/api/__init__.py @@ -14,7 +14,6 @@ from .dataset import Dataset from .dataset_location import DatasetStorageLocation from .dataset_profiling_run import DatasetProfilingRun -from .dataset_table import DatasetTable from .notification import Notification from .redshift_cluster import RedshiftCluster from .vpc import Vpc diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index a7a1bf5f4..8d977403a 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -5,6 +5,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy +from dataall.modules.datasets.services.dataset_table import DatasetTable def list_table_columns( @@ -19,7 +20,7 @@ def list_table_columns( filter = {} with context.engine.scoped_session() as session: if not source: - source = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + source = DatasetTable.get_dataset_table_by_uri(session, tableUri) q = ( session.query(models.DatasetTableColumn) .filter( @@ -44,7 +45,7 @@ def list_table_columns( def sync_table_columns(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -79,7 +80,7 @@ def update_table_column( ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, column.tableUri ) ResourcePolicy.check_user_resource_permission( 
diff --git a/backend/dataall/modules/datasets/services/__init__.py b/backend/dataall/modules/datasets/services/__init__.py new file mode 100644 index 000000000..03ef29863 --- /dev/null +++ b/backend/dataall/modules/datasets/services/__init__.py @@ -0,0 +1 @@ +"""Contains business logic for datasets""" diff --git a/backend/dataall/db/api/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py similarity index 98% rename from backend/dataall/db/api/dataset_table.py rename to backend/dataall/modules/datasets/services/dataset_table.py index 77ee515e3..7c46120c1 100644 --- a/backend/dataall/db/api/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -1,12 +1,11 @@ import logging -from typing import List from sqlalchemy.sql import and_ -from .. import models, api, permissions, exceptions, paginate -from . import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment -from ..models import Dataset -from ...utils import json_utils +from dataall.db import models, api, permissions, exceptions, paginate +from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment +from dataall.db.models import Dataset +from dataall.utils import json_utils logger = logging.getLogger(__name__) diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/tasks/subscriptions/subscription_service.py index 52aeb4e40..7b2a6d461 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/tasks/subscriptions/subscription_service.py @@ -14,6 +14,7 @@ from ...db import models from ...tasks.subscriptions import poll_queues from ...utils import json_utils +from dataall.modules.datasets.services.dataset_table import DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -64,7 +65,7 @@ def notify_consumers(engine, messages): @staticmethod def publish_table_update_message(engine, message): with engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), @@ -135,7 +136,7 @@ def publish_location_update_message(session, message): @staticmethod def store_dataquality_results(session, message): - table: models.DatasetTable = db.api.DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/tasks/tables_syncer.py index 04bafdfa5..5e6e8d48e 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/tasks/tables_syncer.py @@ -13,6 +13,7 @@ connect, ) from ..utils.alarm_service import AlarmService +from dataall.modules.datasets.services.dataset_table import DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -63,7 +64,7 @@ def sync_tables(engine, es=None): f'Found {len(tables)} tables on Glue database {dataset.GlueDatabaseName}' ) - db.api.DatasetTable.sync( + DatasetTable.sync( session, dataset.datasetUri, glue_tables=tables ) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index 6c30e77ea..af27529d5 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -3,6 +3,7 @@ import pytest import dataall +from dataall.modules.datasets.services.dataset_table import DatasetTable @pytest.fixture(scope='module', autouse=True) 
@@ -289,9 +290,7 @@ def test_sync_tables_and_columns(client, table, dataset1, db): }, ] - assert dataall.db.api.DatasetTable.sync( - session, dataset1.datasetUri, glue_tables - ) + assert DatasetTable.sync(session, dataset1.datasetUri, glue_tables) new_table: dataall.db.models.DatasetTable = ( session.query(dataall.db.models.DatasetTable) .filter(dataall.db.models.DatasetTable.name == 'new_table') From b039163449245ef6c079f3c277ce52e2a5cda579 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:02:44 +0200 Subject: [PATCH 07/67] Dataset refactoring Renamed DatasetTable to DatasetTableService to avoid collisions with models.DatasetTable --- .../api/Objects/DatasetProfiling/resolvers.py | 4 +-- .../api/Objects/DatasetTable/resolvers.py | 26 +++++++++---------- backend/dataall/aws/handlers/glue.py | 6 ++--- backend/dataall/aws/handlers/redshift.py | 4 +-- .../datasets/api/table_column/resolvers.py | 8 +++--- .../datasets/services/dataset_table.py | 16 ++++++------ .../subscriptions/subscription_service.py | 6 ++--- backend/dataall/tasks/tables_syncer.py | 4 +-- tests/api/test_dataset_table.py | 4 +-- 9 files changed, 39 insertions(+), 39 deletions(-) diff --git a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py index c391a1a8c..4b4684019 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py +++ b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py @@ -6,7 +6,7 @@ from ....aws.handlers.sts import SessionHelper from ....db import api, permissions, models from ....db.api import ResourcePolicy -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -98,7 +98,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None): if run: if not run.results: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) dataset = api.Dataset.get_dataset_by_uri(session, table.datasetUri) environment = api.Environment.get_environment_by_uri( session, dataset.environmentUri diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 854b99de9..3e2b833e3 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,14 +13,14 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) def create_table(context, source, datasetUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = DatasetTable.create_dataset_table( + table = DatasetTableService.create_dataset_table( session=session, username=context.username, groups=context.groups, @@ -38,7 +38,7 @@ def list_dataset_tables(context, source, filter: dict = None): if not filter: filter = {} with context.engine.scoped_session() as session: - return DatasetTable.list_dataset_tables( + return DatasetTableService.list_dataset_tables( session=session, username=context.username, groups=context.groups, @@ -50,8 +50,8 @@ def list_dataset_tables(context, source, filter: dict = None): def get_table(context, 
source: models.Dataset, tableUri: str = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) - return DatasetTable.get_dataset_table( + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) + return DatasetTableService.get_dataset_table( session=session, username=context.username, groups=context.groups, @@ -65,14 +65,14 @@ def get_table(context, source: models.Dataset, tableUri: str = None): def update_table(context, source, tableUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) input['table'] = table input['tableUri'] = table.tableUri - DatasetTable.update_dataset_table( + DatasetTableService.update_dataset_table( session=session, username=context.username, groups=context.groups, @@ -86,8 +86,8 @@ def update_table(context, source, tableUri: str = None, input: dict = None): def delete_table(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) - DatasetTable.delete_dataset_table( + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) + DatasetTableService.delete_dataset_table( session=session, username=context.username, groups=context.groups, @@ -103,7 +103,7 @@ def delete_table(context, source, tableUri: str = None): def preview(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) @@ -158,7 +158,7 @@ def get_glue_table_properties(context: Context, source: models.DatasetTable, **k if not source: return None with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, source.tableUri ) return json_utils.to_string(table.GlueTableProperties).replace('\\', ' ') @@ -187,7 +187,7 @@ def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwar def publish_table_update(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -236,7 +236,7 @@ def resolve_redshift_copy_location( def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str, filter: dict = None): with context.engine.scoped_session() as session: - return DatasetTable.get_dataset_tables_shared_with_env( + return DatasetTableService.get_dataset_tables_shared_with_env( session, envUri, datasetUri diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 468268640..88f68fc84 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,7 +6,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger('aws:glue') @@ -85,7 +85,7 @@ def list_tables(engine, task: models.Task): tables = Glue.list_glue_database_tables( accountid, dataset.GlueDatabaseName, region ) - DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) + DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) return tables @staticmethod @@ -643,7 +643,7 @@ def get_table_columns(engine, task: models.Task): f'//{dataset_table.name} due to: ' f'{e}' ) - DatasetTable.sync_table_columns( + DatasetTableService.sync_table_columns( session, dataset_table, glue_table['Table'] ) return True diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index a1f479417..4d2591520 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,7 +9,7 @@ from .sts import SessionHelper from ... import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -447,7 +447,7 @@ def copy_data(engine, task: models.Task): session, task.payload['datasetUri'] ) - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, task.payload['tableUri'] ) diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index 8d977403a..e5e7fd60c 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -5,7 +5,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService def list_table_columns( @@ -20,7 +20,7 @@ def list_table_columns( filter = {} with context.engine.scoped_session() as session: if not source: - source = DatasetTable.get_dataset_table_by_uri(session, tableUri) + source = DatasetTableService.get_dataset_table_by_uri(session, tableUri) q = ( session.query(models.DatasetTableColumn) .filter( @@ -45,7 +45,7 @@ def list_table_columns( def sync_table_columns(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -80,7 +80,7 @@ def update_table_column( ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, column.tableUri ) ResourcePolicy.check_user_resource_permission( diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index 7c46120c1..eeeb99f1d 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ 
b/backend/dataall/modules/datasets/services/dataset_table.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -class DatasetTable: +class DatasetTableService: @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @has_resource_perm(permissions.CREATE_DATASET_TABLE) @@ -107,7 +107,7 @@ def get_dataset_table( data: dict = None, check_perm: bool = False, ) -> models.DatasetTable: - return DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + return DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @@ -122,7 +122,7 @@ def update_dataset_table( ): table = data.get( 'table', - DatasetTable.get_dataset_table_by_uri(session, data['tableUri']), + DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']), ) for k in [attr for attr in data.keys() if attr != 'term']: @@ -146,7 +146,7 @@ def delete_dataset_table( data: dict = None, check_perm: bool = False, ): - table = DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() share_item = ( session.query(models.ShareObjectItem) @@ -210,7 +210,7 @@ def get_dataset_tables_shared_with_env( ): return [ {"tableUri": t.tableUri, "GlueTableName": t.GlueTableName} - for t in DatasetTable.query_dataset_tables_shared_with_env( + for t in DatasetTableService.query_dataset_tables_shared_with_env( session, environment_uri, dataset_uri ) ] @@ -235,7 +235,7 @@ def sync(session, datasetUri, glue_tables=None): existing_table_names = [e.GlueTableName for e in existing_tables] existing_dataset_tables_map = {t.GlueTableName: t for t in existing_tables} - DatasetTable.update_existing_tables_status(existing_tables, glue_tables) + DatasetTableService.update_existing_tables_status(existing_tables, glue_tables) logger.info( f'existing_tables={glue_tables}' ) @@ -284,7 +284,7 @@ def sync(session, datasetUri, glue_tables=None): table.get('Parameters', {}) ) - DatasetTable.sync_table_columns(session, updated_table, table) + DatasetTableService.sync_table_columns(session, updated_table, table) return True @@ -300,7 +300,7 @@ def update_existing_tables_status(existing_tables, glue_tables): @staticmethod def sync_table_columns(session, dataset_table, glue_table): - DatasetTable.delete_all_table_columns(session, dataset_table) + DatasetTableService.delete_all_table_columns(session, dataset_table) columns = [ {**item, **{'columnType': 'column'}} diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/tasks/subscriptions/subscription_service.py index 7b2a6d461..bf7eded35 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/tasks/subscriptions/subscription_service.py @@ -14,7 +14,7 @@ from ...db import models from ...tasks.subscriptions import poll_queues from ...utils import json_utils -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() root.setLevel(logging.INFO) @@ -65,7 +65,7 @@ def notify_consumers(engine, messages): @staticmethod def publish_table_update_message(engine, message): with engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, message.get('prefix'), 
message.get('accountid'), @@ -136,7 +136,7 @@ def publish_location_update_message(session, message): @staticmethod def store_dataquality_results(session, message): - table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/tasks/tables_syncer.py index 5e6e8d48e..7d2781ccf 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/tasks/tables_syncer.py @@ -13,7 +13,7 @@ connect, ) from ..utils.alarm_service import AlarmService -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() root.setLevel(logging.INFO) @@ -64,7 +64,7 @@ def sync_tables(engine, es=None): f'Found {len(tables)} tables on Glue database {dataset.GlueDatabaseName}' ) - DatasetTable.sync( + DatasetTableService.sync( session, dataset.datasetUri, glue_tables=tables ) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index af27529d5..82548252b 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -3,7 +3,7 @@ import pytest import dataall -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService @pytest.fixture(scope='module', autouse=True) @@ -290,7 +290,7 @@ def test_sync_tables_and_columns(client, table, dataset1, db): }, ] - assert DatasetTable.sync(session, dataset1.datasetUri, glue_tables) + assert DatasetTableService.sync(session, dataset1.datasetUri, glue_tables) new_table: dataall.db.models.DatasetTable = ( session.query(dataall.db.models.DatasetTable) .filter(dataall.db.models.DatasetTable.name == 'new_table') From b7922ed51c674e210c66a0bd298dae099c9eface Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:19:02 +0200 Subject: [PATCH 08/67] Dataset refactoring Moved DatasetTableColumn to modules --- backend/dataall/api/Objects/Feed/resolvers.py | 5 ++-- .../dataall/api/Objects/Glossary/resolvers.py | 5 ++-- backend/dataall/aws/handlers/glue.py | 5 ++-- backend/dataall/db/api/glossary.py | 11 +++---- backend/dataall/db/models/__init__.py | 1 - .../datasets/api/table_column/resolvers.py | 21 +++++++------- .../dataall/modules/datasets/db/__init__.py | 1 + .../datasets/db/table_column_model.py} | 4 +-- .../datasets/services/dataset_table.py | 9 +++--- tests/api/test_dataset_table.py | 29 ++++++++++--------- tests/api/test_glossary.py | 5 ++-- 11 files changed, 52 insertions(+), 44 deletions(-) create mode 100644 backend/dataall/modules/datasets/db/__init__.py rename backend/dataall/{db/models/DatasetTableColumn.py => modules/datasets/db/table_column_model.py} (91%) diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index a6c0535de..cbde23f0d 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,6 +2,7 @@ from ....api.context import Context from ....db import paginate, models +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn class Feed: @@ -19,7 +20,7 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - if isinstance(obj, models.DatasetTableColumn): + if isinstance(obj, DatasetTableColumn): return 'DatasetTableColumn' elif isinstance(obj, 
models.Worksheet): return 'Worksheet' @@ -44,7 +45,7 @@ def resolve_target(context: Context, source: Feed, **kwargs): model = { 'Dataset': models.Dataset, 'DatasetTable': models.DatasetTable, - 'DatasetTableColumn': models.DatasetTableColumn, + 'DatasetTableColumn': DatasetTableColumn, 'DatasetStorageLocation': models.DatasetStorageLocation, 'Dashboard': models.Dashboard, 'DataPipeline': models.DataPipeline, diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 801bd27dc..847f16ac6 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -11,6 +11,7 @@ from ....api.constants import ( GlossaryRole ) +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def resolve_glossary_node(obj: models.GlossaryNode, *_): @@ -322,7 +323,7 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - if isinstance(obj, models.DatasetTableColumn): + if isinstance(obj, DatasetTableColumn): return 'DatasetTableColumn' elif isinstance(obj, models.DatasetTable): return 'DatasetTable' @@ -341,7 +342,7 @@ def resolve_link_target(context, source, **kwargs): model = { 'Dataset': models.Dataset, 'DatasetTable': models.DatasetTable, - 'Column': models.DatasetTableColumn, + 'Column': DatasetTableColumn, 'DatasetStorageLocation': models.DatasetStorageLocation, 'Dashboard': models.Dashboard, }[source.targetType] diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 88f68fc84..ca00a81f5 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -7,6 +7,7 @@ from ... import db from ...db import models from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn log = logging.getLogger('aws:glue') @@ -525,8 +526,8 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): @Worker.handler('glue.table.update_column') def update_table_columns(engine, task: models.Task): with engine.scoped_session() as session: - column: models.DatasetTableColumn = session.query( - models.DatasetTableColumn + column: DatasetTableColumn = session.query( + DatasetTableColumn ).get(task.targetUri) table: models.DatasetTable = session.query(models.DatasetTable).get( column.tableUri diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 1616141c8..c6313e007 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -9,6 +9,7 @@ has_tenant_perm, ) from ..models.Glossary import GlossaryNodeStatus +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn logger = logging.getLogger(__name__) @@ -133,7 +134,7 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): elif targetType == 'Folder': target = session.query(models.DatasetStorageLocation).get(targetUri) elif targetType == 'Column': - target = session.query(models.DatasetTableColumn).get(targetUri) + target = session.query(DatasetTableColumn).get(targetUri) elif targetType == 'Dashboard': target = session.query(models.Dashboard).get(targetUri) else: @@ -361,11 +362,11 @@ def list_term_associations( models.DatasetTable.description.label('description'), ) columns = session.query( - models.DatasetTableColumn.columnUri.label('targetUri'), + DatasetTableColumn.columnUri.label('targetUri'), 
literal('column').label('targetType'), - models.DatasetTableColumn.label.label('label'), - models.DatasetTableColumn.name.label('name'), - models.DatasetTableColumn.description.label('description'), + DatasetTableColumn.label.label('label'), + DatasetTableColumn.name.label('name'), + DatasetTableColumn.description.label('description'), ) folders = session.query( models.DatasetStorageLocation.locationUri.label('targetUri'), diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py index 1ce567c87..1ab4134b3 100644 --- a/backend/dataall/db/models/__init__.py +++ b/backend/dataall/db/models/__init__.py @@ -9,7 +9,6 @@ from .DatasetQualityRule import DatasetQualityRule from .DatasetStorageLocation import DatasetStorageLocation from .DatasetTable import DatasetTable -from .DatasetTableColumn import DatasetTableColumn from .DatasetTableProfilingJob import DatasetTableProfilingJob from .Environment import Environment from .EnvironmentGroup import EnvironmentGroup diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index e5e7fd60c..b958f2f7a 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -6,6 +6,7 @@ from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def list_table_columns( @@ -22,21 +23,21 @@ def list_table_columns( if not source: source = DatasetTableService.get_dataset_table_by_uri(session, tableUri) q = ( - session.query(models.DatasetTableColumn) + session.query(DatasetTableColumn) .filter( - models.DatasetTableColumn.tableUri == tableUri, - models.DatasetTableColumn.deleted.is_(None), + DatasetTableColumn.tableUri == tableUri, + DatasetTableColumn.deleted.is_(None), ) - .order_by(models.DatasetTableColumn.columnType.asc()) + .order_by(DatasetTableColumn.columnType.asc()) ) term = filter.get('term') if term: q = q.filter( or_( - models.DatasetTableColumn.label.ilike('%' + term + '%'), - models.DatasetTableColumn.description.ilike('%' + term + '%'), + DatasetTableColumn.label.ilike('%' + term + '%'), + DatasetTableColumn.description.ilike('%' + term + '%'), ) - ).order_by(models.DatasetTableColumn.columnType.asc()) + ).order_by(DatasetTableColumn.columnType.asc()) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 65) @@ -61,7 +62,7 @@ def sync_table_columns(context: Context, source, tableUri: str = None): return list_table_columns(context, source=table, tableUri=tableUri) -def resolve_terms(context, source: models.DatasetTableColumn, **kwargs): +def resolve_terms(context, source: DatasetTableColumn, **kwargs): if not source: return None with context.engine.scoped_session() as session: @@ -75,8 +76,8 @@ def update_table_column( context: Context, source, columnUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - column: models.DatasetTableColumn = session.query( - models.DatasetTableColumn + column: DatasetTableColumn = session.query( + DatasetTableColumn ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) diff --git a/backend/dataall/modules/datasets/db/__init__.py b/backend/dataall/modules/datasets/db/__init__.py new file mode 100644 index 000000000..104b49a42 --- /dev/null 
+++ b/backend/dataall/modules/datasets/db/__init__.py @@ -0,0 +1 @@ +"""Database logic for datasets""" diff --git a/backend/dataall/db/models/DatasetTableColumn.py b/backend/dataall/modules/datasets/db/table_column_model.py similarity index 91% rename from backend/dataall/db/models/DatasetTableColumn.py rename to backend/dataall/modules/datasets/db/table_column_model.py index f4fe1f7d6..4d3d7e009 100644 --- a/backend/dataall/db/models/DatasetTableColumn.py +++ b/backend/dataall/modules/datasets/db/table_column_model.py @@ -1,7 +1,7 @@ from sqlalchemy import Column, String -from .. import Base -from .. import Resource, utils +from dataall.db import Base +from dataall.db import Resource, utils class DatasetTableColumn(Resource, Base): diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index eeeb99f1d..873cbe01e 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -6,6 +6,7 @@ from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment from dataall.db.models import Dataset from dataall.utils import json_utils +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn logger = logging.getLogger(__name__) @@ -315,7 +316,7 @@ def sync_table_columns(session, dataset_table, glue_table): logger.debug(f'Found partitions {partitions} for table {dataset_table}') for col in columns + partitions: - table_col = models.DatasetTableColumn( + table_col = DatasetTableColumn( name=col['Name'], description=col.get('Comment', 'No description provided'), label=col['Name'], @@ -333,11 +334,11 @@ def sync_table_columns(session, dataset_table, glue_table): @staticmethod def delete_all_table_columns(session, dataset_table): - session.query(models.DatasetTableColumn).filter( + session.query(DatasetTableColumn).filter( and_( - models.DatasetTableColumn.GlueDatabaseName + DatasetTableColumn.GlueDatabaseName == dataset_table.GlueDatabaseName, - models.DatasetTableColumn.GlueTableName == dataset_table.GlueTableName, + DatasetTableColumn.GlueTableName == dataset_table.GlueTableName, ) ).delete() session.commit() diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index 82548252b..a2fcb2add 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -4,6 +4,7 @@ import dataall from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn @pytest.fixture(scope='module', autouse=True) @@ -112,7 +113,7 @@ def test_add_columns(table, dataset1, db): .filter(dataall.db.models.DatasetTable.name == 'table1') .first() ) - table_col = dataall.db.models.DatasetTableColumn( + table_col = DatasetTableColumn( name='col1', description='None', label='col1', @@ -186,8 +187,8 @@ def test_update_dataset_table_column(client, table, dataset1, db): .first() ) column = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == table.tableUri) + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == table.tableUri) .first() ) response = client.query( @@ -208,7 +209,7 @@ def test_update_dataset_table_column(client, table, dataset1, db): response.data.updateDatasetTableColumn.description == 'My new description' ) - column = session.query(dataall.db.models.DatasetTableColumn).get( + column = 
session.query(DatasetTableColumn).get( column.columnUri ) assert column.description == 'My new description' @@ -235,8 +236,8 @@ def test_sync_tables_and_columns(client, table, dataset1, db): .first() ) column = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == table.tableUri) + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == table.tableUri) .first() ) glue_tables = [ @@ -298,10 +299,10 @@ def test_sync_tables_and_columns(client, table, dataset1, db): ) assert new_table assert new_table.GlueTableName == 'new_table' - columns: [dataall.db.models.DatasetTableColumn] = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == new_table.tableUri) - .order_by(dataall.db.models.DatasetTableColumn.columnType.asc()) + columns: [DatasetTableColumn] = ( + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == new_table.tableUri) + .order_by(DatasetTableColumn.columnType.asc()) .all() ) assert len(columns) == 2 @@ -315,10 +316,10 @@ def test_sync_tables_and_columns(client, table, dataset1, db): ) assert existing_table assert existing_table.GlueTableName == 'table1' - columns: [dataall.db.models.DatasetTableColumn] = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == new_table.tableUri) - .order_by(dataall.db.models.DatasetTableColumn.columnType.asc()) + columns: [DatasetTableColumn] = ( + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == new_table.tableUri) + .order_by(DatasetTableColumn.columnType.asc()) .all() ) assert len(columns) == 2 diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index 157c6cd2c..8276dca8c 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,5 +1,6 @@ from typing import List from dataall.db import models +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn import pytest @@ -48,11 +49,11 @@ def _table(db, _dataset) -> models.DatasetTable: @pytest.fixture(scope='module', autouse=True) -def _columns(db, _dataset, _table) -> List[models.DatasetTableColumn]: +def _columns(db, _dataset, _table) -> List[DatasetTableColumn]: with db.scoped_session() as session: cols = [] for i in range(0, 10): - c = models.DatasetTableColumn( + c = DatasetTableColumn( datasetUri=_dataset.datasetUri, tableUri=_table.tableUri, label=f'c{i+1}', From 1771bcaa4fc590eda8ca01537e544b2e1f5fa317 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:28:27 +0200 Subject: [PATCH 09/67] Notebooks doesn't require tasks --- backend/dataall/modules/notebooks/tasks/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 backend/dataall/modules/notebooks/tasks/__init__.py diff --git a/backend/dataall/modules/notebooks/tasks/__init__.py b/backend/dataall/modules/notebooks/tasks/__init__.py deleted file mode 100644 index 7da194e3b..000000000 --- a/backend/dataall/modules/notebooks/tasks/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Currently notebooks don't have tasks, but this module needed for correct loading""" From 3d1603f21806bbaa099d70f9af072c5f2d40a5e4 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:31:07 +0200 Subject: [PATCH 10/67] Renamed tasks to handlers Currently, only async handlers require dedicated loading. 
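For illustration, the split boils down to the mode check sketched below. This is a sketch, not the loader implementation: ImportMode, ModuleInterface, is_supported and DatasetAsyncHandlersModuleInterface mirror the code in this series, while the simplified load_modules body (scanning subclasses instead of importing packages from config) is an assumption made only for the example.

    from abc import ABC, abstractmethod
    from enum import Enum, auto
    from typing import List

    class ImportMode(Enum):
        API = auto()       # GraphQL Lambda
        CDK = auto()       # cdkproxy / stack synthesis
        HANDLERS = auto()  # short-lived async worker Lambda

    class ModuleInterface(ABC):
        @classmethod
        @abstractmethod
        def is_supported(cls, modes: List[ImportMode]) -> bool:
            ...

    class DatasetAsyncHandlersModuleInterface(ModuleInterface):
        """Loaded only when the async-handler Lambda bootstraps."""

        @classmethod
        def is_supported(cls, modes: List[ImportMode]) -> bool:
            return ImportMode.HANDLERS in modes

    def load_modules(modes: List[ImportMode]) -> None:
        # Simplified: the real loader reads the config and imports
        # dataall.modules.<name>; only the mode check matters here.
        for interface in ModuleInterface.__subclasses__():
            if interface.is_supported(modes):
                interface()  # a module imports its handlers in __init__

    load_modules(modes=[ImportMode.HANDLERS])  # what aws_handler.py now requests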
Long-running tasks (scheduled tasks) might not need to have a dedicated loading mode --- backend/aws_handler.py | 2 +- backend/dataall/modules/loader.py | 8 ++++---- backend/local_graphql_server.py | 2 +- tests/conftest.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/aws_handler.py b/backend/aws_handler.py index 872c8f433..ec5382b6f 100644 --- a/backend/aws_handler.py +++ b/backend/aws_handler.py @@ -14,7 +14,7 @@ engine = get_engine(envname=ENVNAME) -load_modules(modes=[ImportMode.TASKS]) +load_modules(modes=[ImportMode.HANDLERS]) def handler(event, context=None): diff --git a/backend/dataall/modules/loader.py b/backend/dataall/modules/loader.py index aa4a656d4..9fa3c69bf 100644 --- a/backend/dataall/modules/loader.py +++ b/backend/dataall/modules/loader.py @@ -2,7 +2,7 @@ import importlib import logging from abc import ABC, abstractmethod -from enum import Enum +from enum import Enum, auto from typing import List from dataall.core.config import config @@ -19,9 +19,9 @@ class ImportMode(Enum): of functionality to be loaded, there should be different loading modes """ - API = "api" - CDK = "cdk" - TASKS = "tasks" + API = auto() + CDK = auto() + HANDLERS = auto() class ModuleInterface(ABC): diff --git a/backend/local_graphql_server.py b/backend/local_graphql_server.py index 3783ba0a3..44f79a087 100644 --- a/backend/local_graphql_server.py +++ b/backend/local_graphql_server.py @@ -30,7 +30,7 @@ es = connect(envname=ENVNAME) logger.info('Connected') # create_schema_and_tables(engine, envname=ENVNAME) -load_modules(modes=[ImportMode.API, ImportMode.TASKS]) +load_modules(modes=[ImportMode.API, ImportMode.HANDLERS]) Base.metadata.create_all(engine.engine) CDKPROXY_URL = ( 'http://cdkproxy:2805' if ENVNAME == 'dkrcompose' else 'http://localhost:2805' diff --git a/tests/conftest.py b/tests/conftest.py index a67d6bd41..2767a66a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ import dataall from dataall.modules.loader import load_modules, ImportMode -load_modules(modes=[ImportMode.TASKS, ImportMode.API, ImportMode.CDK]) +load_modules(modes=[ImportMode.HANDLERS, ImportMode.API, ImportMode.CDK]) ENVNAME = os.environ.get('envname', 'pytest') From fb6b515103927e80f96d8248ddb95568a08b7f7c Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 16:00:18 +0200 Subject: [PATCH 11/67] Dataset refactoring Extracted code from glue to glue_column_handler Added handlers importing for datasets --- backend/dataall/aws/handlers/glue.py | 99 --------------- backend/dataall/db/api/redshift_cluster.py | 6 +- backend/dataall/modules/datasets/__init__.py | 15 ++- .../modules/datasets/handlers/__init__.py | 8 ++ .../datasets/handlers/glue_column_handler.py | 113 ++++++++++++++++++ 5 files changed, 138 insertions(+), 103 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/__init__.py create mode 100644 backend/dataall/modules/datasets/handlers/glue_column_handler.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index ca00a81f5..4bfda7ce3 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -7,7 +7,6 @@ from ... 
import db from ...db import models from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn log = logging.getLogger('aws:glue') @@ -522,104 +521,6 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): else: raise e - @staticmethod - @Worker.handler('glue.table.update_column') - def update_table_columns(engine, task: models.Task): - with engine.scoped_session() as session: - column: DatasetTableColumn = session.query( - DatasetTableColumn - ).get(task.targetUri) - table: models.DatasetTable = session.query(models.DatasetTable).get( - column.tableUri - ) - try: - aws_session = SessionHelper.remote_session(table.AWSAccountId) - - Glue.grant_pivot_role_all_table_permissions(aws_session, table) - - glue_client = aws_session.client('glue', region_name=table.region) - - original_table = glue_client.get_table( - CatalogId=table.AWSAccountId, - DatabaseName=table.GlueDatabaseName, - Name=table.name, - ) - updated_table = { - k: v - for k, v in original_table['Table'].items() - if k - not in [ - 'CatalogId', - 'VersionId', - 'DatabaseName', - 'CreateTime', - 'UpdateTime', - 'CreatedBy', - 'IsRegisteredWithLakeFormation', - ] - } - all_columns = updated_table.get('StorageDescriptor', {}).get( - 'Columns', [] - ) + updated_table.get('PartitionKeys', []) - for col in all_columns: - if col['Name'] == column.name: - col['Comment'] = column.description - log.info( - f'Found column {column.name} adding description {column.description}' - ) - response = glue_client.update_table( - DatabaseName=table.GlueDatabaseName, - TableInput=updated_table, - ) - log.info( - f'Column {column.name} updated successfully: {response}' - ) - return True - - except ClientError as e: - log.error( - f'Failed to update table column {column.name} description: {e}' - ) - raise e - - @staticmethod - def grant_pivot_role_all_table_permissions(aws_session, table): - """ - Pivot role needs to have all permissions - for tables managed inside dataall - :param aws_session: - :param table: - :return: - """ - try: - lf_client = aws_session.client('lakeformation', region_name=table.region) - grant_dict = dict( - Principal={ - 'DataLakePrincipalIdentifier': SessionHelper.get_delegation_role_arn( - table.AWSAccountId - ) - }, - Resource={ - 'Table': { - 'DatabaseName': table.GlueDatabaseName, - 'Name': table.name, - } - }, - Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], - ) - response = lf_client.grant_permissions(**grant_dict) - log.error( - f'Successfully granted pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {response}' - ) - except ClientError as e: - log.error( - f'Failed to grant pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {e}' - ) - raise e @staticmethod @Worker.handler('glue.table.columns') diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 91e687d2b..4167a555a 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -3,13 +3,13 @@ from sqlalchemy import and_, or_, literal from .. import models, api, exceptions, paginate, permissions -from . import has_resource_perm, ResourcePolicy, DatasetTable, Environment, Dataset -from ..models.Enums import ShareItemStatus +from . 
import has_resource_perm, ResourcePolicy, Environment, Dataset from ...utils.naming_convention import ( NamingConventionService, NamingConventionPattern, ) from ...utils.slugify import slugify +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -495,7 +495,7 @@ def enable_copy_table( session, username, groups, uri, data=None, check_perm=True ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - table = DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) table = models.RedshiftClusterDatasetTable( clusterUri=uri, datasetUri=data['datasetUri'], diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 67298a06e..cd52bc207 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -1,12 +1,14 @@ """Contains the code related to datasets""" import logging +from typing import List + from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) class DatasetApiModuleInterface(ModuleInterface): - """Implements ModuleInterface for notebook GraphQl lambda""" + """Implements ModuleInterface for dataset GraphQl lambda""" @classmethod def is_supported(cls, modes): @@ -16,3 +18,14 @@ def __init__(self): import dataall.modules.datasets.api log.info("API of datasets has been imported") + +class DatasetAsyncHandlersModuleInterface(ModuleInterface): + """Implements ModuleInterface for dataset async lambda""" + + @classmethod + def is_supported(cls, modes: List[ImportMode]): + return ImportMode.HANDLERS in modes + + def __init__(self): + import dataall.modules.datasets.handlers + log.info("Dataset handlers have been imported") diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py new file mode 100644 index 000000000..7ed90c729 --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -0,0 +1,8 @@ +""" +Contains code with the handlers that are need for async +processing in a separate lambda function +""" +from dataall.modules.datasets.handlers import glue_column_handler + +__all__ = ["glue_column_handler"] + diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py new file mode 100644 index 000000000..e7f8d358b --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -0,0 +1,113 @@ +import logging + +from botocore.exceptions import ClientError + +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import models +from dataall.aws.handlers.service_handlers import Worker +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn + +log = logging.getLogger(__name__) + + +class DatasetColumnGlueHandler: + """A handler for dataset table columns""" + + @staticmethod + @Worker.handler('glue.table.update_column') + def update_table_columns(engine, task: models.Task): + with engine.scoped_session() as session: + column: DatasetTableColumn = session.query( + DatasetTableColumn + ).get(task.targetUri) + table: models.DatasetTable = session.query(models.DatasetTable).get( + column.tableUri + ) + try: + aws_session = SessionHelper.remote_session(table.AWSAccountId) + + 
DatasetColumnGlueHandler.grant_pivot_role_all_table_permissions(aws_session, table) + + glue_client = aws_session.client('glue', region_name=table.region) + + original_table = glue_client.get_table( + CatalogId=table.AWSAccountId, + DatabaseName=table.GlueDatabaseName, + Name=table.name, + ) + updated_table = { + k: v + for k, v in original_table['Table'].items() + if k + not in [ + 'CatalogId', + 'VersionId', + 'DatabaseName', + 'CreateTime', + 'UpdateTime', + 'CreatedBy', + 'IsRegisteredWithLakeFormation', + ] + } + all_columns = updated_table.get('StorageDescriptor', {}).get( + 'Columns', [] + ) + updated_table.get('PartitionKeys', []) + for col in all_columns: + if col['Name'] == column.name: + col['Comment'] = column.description + log.info( + f'Found column {column.name} adding description {column.description}' + ) + response = glue_client.update_table( + DatabaseName=table.GlueDatabaseName, + TableInput=updated_table, + ) + log.info( + f'Column {column.name} updated successfully: {response}' + ) + return True + + except ClientError as e: + log.error( + f'Failed to update table column {column.name} description: {e}' + ) + raise e + + @staticmethod + def grant_pivot_role_all_table_permissions(aws_session, table): + """ + Pivot role needs to have all permissions + for tables managed inside dataall + :param aws_session: + :param table: + :return: + """ + try: + lf_client = aws_session.client('lakeformation', region_name=table.region) + grant_dict = dict( + Principal={ + 'DataLakePrincipalIdentifier': SessionHelper.get_delegation_role_arn( + table.AWSAccountId + ) + }, + Resource={ + 'Table': { + 'DatabaseName': table.GlueDatabaseName, + 'Name': table.name, + } + }, + Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], + ) + response = lf_client.grant_permissions(**grant_dict) + log.error( + f'Successfully granted pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {response}' + ) + except ClientError as e: + log.error( + f'Failed to grant pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {e}' + ) + raise e From e3596a553a42d42baddcb3d36bd2b71b60f101a2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 17:18:46 +0200 Subject: [PATCH 12/67] Dataset refactoring Extracted the code for dataset table handler --- backend/dataall/aws/handlers/glue.py | 15 ---------- .../modules/datasets/handlers/__init__.py | 7 +++-- .../datasets/handlers/glue_table_handler.py | 30 +++++++++++++++++++ 3 files changed, 35 insertions(+), 17 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/glue_table_handler.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 4bfda7ce3..cc8c5cfc7 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -73,21 +73,6 @@ def database_exists(**data): log.info(f'Database {database} does not exist on account {accountid}...') return False - @staticmethod - @Worker.handler(path='glue.dataset.database.tables') - def list_tables(engine, task: models.Task): - with engine.scoped_session() as session: - dataset: models.Dataset = db.api.Dataset.get_dataset_by_uri( - session, task.targetUri - ) - accountid = dataset.AwsAccountId - region = dataset.region - tables = Glue.list_glue_database_tables( - accountid, dataset.GlueDatabaseName, region - ) - DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) - return tables - @staticmethod def 
list_glue_database_tables(accountid, database, region): aws_session = SessionHelper.remote_session(accountid=accountid) diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py index 7ed90c729..19bd47297 100644 --- a/backend/dataall/modules/datasets/handlers/__init__.py +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -2,7 +2,10 @@ Contains code with the handlers that are need for async processing in a separate lambda function """ -from dataall.modules.datasets.handlers import glue_column_handler +from dataall.modules.datasets.handlers import ( + glue_column_handler, + glue_table_handler +) -__all__ = ["glue_column_handler"] +__all__ = ["glue_column_handler", "glue_table_handler"] diff --git a/backend/dataall/modules/datasets/handlers/glue_table_handler.py b/backend/dataall/modules/datasets/handlers/glue_table_handler.py new file mode 100644 index 000000000..9bb50c501 --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/glue_table_handler.py @@ -0,0 +1,30 @@ +import logging + +from botocore.exceptions import ClientError + +from dataall.aws.handlers.glue import Glue +from dataall.aws.handlers.service_handlers import Worker +from dataall.db import models +from dataall.db.api import Dataset +from dataall.modules.datasets.services.dataset_table import DatasetTableService + +log = logging.getLogger(__name__) + + +class DatasetColumnGlueHandler: + """A handler for dataset table""" + + @staticmethod + @Worker.handler(path='glue.dataset.database.tables') + def list_tables(engine, task: models.Task): + with engine.scoped_session() as session: + dataset: models.Dataset = Dataset.get_dataset_by_uri( + session, task.targetUri + ) + account_id = dataset.AwsAccountId + region = dataset.region + tables = Glue.list_glue_database_tables( + account_id, dataset.GlueDatabaseName, region + ) + DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) + return tables From 3af2ecfb0f24342eea970b4f9fd4263dbb81fc2e Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 17:28:46 +0200 Subject: [PATCH 13/67] Dataset refactoring Extracted the long-running task for datasets --- .../dataall/modules/datasets/tasks/__init__.py | 1 + .../datasets}/tasks/tables_syncer.py | 18 ++++++++---------- backend/dataall/tasks/__init__.py | 1 - deploy/stacks/container.py | 2 +- tests/tasks/test_tables_sync.py | 4 ++-- 5 files changed, 12 insertions(+), 14 deletions(-) create mode 100644 backend/dataall/modules/datasets/tasks/__init__.py rename backend/dataall/{ => modules/datasets}/tasks/tables_syncer.py (92%) diff --git a/backend/dataall/modules/datasets/tasks/__init__.py b/backend/dataall/modules/datasets/tasks/__init__.py new file mode 100644 index 000000000..da597f309 --- /dev/null +++ b/backend/dataall/modules/datasets/tasks/__init__.py @@ -0,0 +1 @@ +"""Code of the long-running tasks that run in ECS""" diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py similarity index 92% rename from backend/dataall/tasks/tables_syncer.py rename to backend/dataall/modules/datasets/tasks/tables_syncer.py index 7d2781ccf..27a870d60 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -3,16 +3,14 @@ import sys from operator import and_ -from .. 
import db -from ..aws.handlers.glue import Glue -from ..aws.handlers.sts import SessionHelper -from ..db import get_engine -from ..db import models -from ..searchproxy import indexers -from ..searchproxy.connect import ( - connect, -) -from ..utils.alarm_service import AlarmService +from dataall import db +from dataall.aws.handlers.glue import Glue +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import get_engine +from dataall.db import models +from dataall.searchproxy import indexers +from dataall.searchproxy.connect import connect +from dataall.utils.alarm_service import AlarmService from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() diff --git a/backend/dataall/tasks/__init__.py b/backend/dataall/tasks/__init__.py index 02ccaaa8b..89cb28e27 100644 --- a/backend/dataall/tasks/__init__.py +++ b/backend/dataall/tasks/__init__.py @@ -1,2 +1 @@ -from .tables_syncer import sync_tables from .catalog_indexer import index_objects diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index c313df82e..47fc3d114 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -94,7 +94,7 @@ def __init__( sync_tables_task = self.set_scheduled_task( cluster=cluster, - command=['python3.8', '-m', 'dataall.tasks.tables_syncer'], + command=['python3.8', '-m', 'dataall.modules.datasets.tasks.tables_syncer'], container_id=f'container', ecr_repository=ecr_repository, environment=self._create_env('INFO'), diff --git a/tests/tasks/test_tables_sync.py b/tests/tasks/test_tables_sync.py index 812dda1bd..d4e86b83f 100644 --- a/tests/tasks/test_tables_sync.py +++ b/tests/tasks/test_tables_sync.py @@ -147,14 +147,14 @@ def _test_tables_sync(db, org, env, sync_dataset, table, mocker): ], ) mocker.patch( - 'dataall.tasks.tables_syncer.is_assumable_pivot_role', return_value=True + 'dataall.modules.datasets.tables_syncer.is_assumable_pivot_role', return_value=True ) mocker.patch( 'dataall.aws.handlers.glue.Glue.grant_principals_all_table_permissions', return_value=True, ) - processed_tables = dataall.tasks.tables_syncer.sync_tables(engine=db) + processed_tables = dataall.modules.datasets.tasks.tables_syncer.sync_tables(engine=db) assert len(processed_tables) == 2 with db.scoped_session() as session: saved_table: dataall.db.models.DatasetTable = ( From 1a063b2cbe020ba5ffe72587902f0872ed788db4 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 18:34:12 +0200 Subject: [PATCH 14/67] Dataset refactoring Extracted the subscription service into datasets --- .../datasets/tasks}/subscription_service.py | 16 ++++++++-------- backend/dataall/tasks/subscriptions/__init__.py | 1 - deploy/stacks/container.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) rename backend/dataall/{tasks/subscriptions => modules/datasets/tasks}/subscription_service.py (97%) diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py similarity index 97% rename from backend/dataall/tasks/subscriptions/subscription_service.py rename to backend/dataall/modules/datasets/tasks/subscription_service.py index bf7eded35..8674f903a 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -6,14 +6,14 @@ from botocore.exceptions import ClientError from sqlalchemy import and_ -from ... 
import db -from ...aws.handlers.service_handlers import Worker -from ...aws.handlers.sts import SessionHelper -from ...aws.handlers.sqs import SqsQueue -from ...db import get_engine -from ...db import models -from ...tasks.subscriptions import poll_queues -from ...utils import json_utils +from dataall import db +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.sts import SessionHelper +from dataall.aws.handlers.sqs import SqsQueue +from dataall.db import get_engine +from dataall.db import models +from dataall.tasks.subscriptions import poll_queues +from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() diff --git a/backend/dataall/tasks/subscriptions/__init__.py b/backend/dataall/tasks/subscriptions/__init__.py index f60ca5310..fa0214e42 100644 --- a/backend/dataall/tasks/subscriptions/__init__.py +++ b/backend/dataall/tasks/subscriptions/__init__.py @@ -1,2 +1 @@ from .sqs_poller import poll_queues -from .subscription_service import SubscriptionService diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index 47fc3d114..d3c761519 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -179,7 +179,7 @@ def __init__( command=[ 'python3.8', '-m', - 'dataall.tasks.subscriptions.subscription_service', + 'dataall.modules.datasets.tasks.subscription_service', ], container_id=f'container', ecr_repository=ecr_repository, From b7337142064d4b14a1e247eae5ff412f7db81448 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 18:39:54 +0200 Subject: [PATCH 15/67] Dataset refactoring Extracted the handler to get table columns --- backend/dataall/aws/handlers/glue.py | 30 ------------------- .../datasets/handlers/glue_column_handler.py | 29 ++++++++++++++++++ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index cc8c5cfc7..e05ce4c54 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,7 +6,6 @@ from .sts import SessionHelper from ... 
import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger('aws:glue') @@ -506,35 +505,6 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): else: raise e - - @staticmethod - @Worker.handler('glue.table.columns') - def get_table_columns(engine, task: models.Task): - with engine.scoped_session() as session: - dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( - task.targetUri - ) - aws = SessionHelper.remote_session(dataset_table.AWSAccountId) - glue_client = aws.client('glue', region_name=dataset_table.region) - glue_table = {} - try: - glue_table = glue_client.get_table( - CatalogId=dataset_table.AWSAccountId, - DatabaseName=dataset_table.GlueDatabaseName, - Name=dataset_table.name, - ) - except glue_client.exceptions.ClientError as e: - log.error( - f'Failed to get table aws://{dataset_table.AWSAccountId}' - f'//{dataset_table.GlueDatabaseName}' - f'//{dataset_table.name} due to: ' - f'{e}' - ) - DatasetTableService.sync_table_columns( - session, dataset_table, glue_table['Table'] - ) - return True - @staticmethod @Worker.handler(path='glue.job.runs') def get_job_runs(engine, task: models.Task): diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index e7f8d358b..02003eea2 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -6,6 +6,7 @@ from dataall.db import models from dataall.aws.handlers.service_handlers import Worker from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -13,6 +14,34 @@ class DatasetColumnGlueHandler: """A handler for dataset table columns""" + @staticmethod + @Worker.handler('glue.table.columns') + def get_table_columns(engine, task: models.Task): + with engine.scoped_session() as session: + dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( + task.targetUri + ) + aws = SessionHelper.remote_session(dataset_table.AWSAccountId) + glue_client = aws.client('glue', region_name=dataset_table.region) + glue_table = {} + try: + glue_table = glue_client.get_table( + CatalogId=dataset_table.AWSAccountId, + DatabaseName=dataset_table.GlueDatabaseName, + Name=dataset_table.name, + ) + except glue_client.exceptions.ClientError as e: + log.error( + f'Failed to get table aws://{dataset_table.AWSAccountId}' + f'//{dataset_table.GlueDatabaseName}' + f'//{dataset_table.name} due to: ' + f'{e}' + ) + DatasetTableService.sync_table_columns( + session, dataset_table, glue_table['Table'] + ) + return True + @staticmethod @Worker.handler('glue.table.update_column') def update_table_columns(engine, task: models.Task): From 2a4e2e09caf2e520118baa1ef22f1fb262541239 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 19:06:39 +0200 Subject: [PATCH 16/67] Extracted feed registry Needed for migration for modules --- backend/dataall/api/Objects/Feed/resolvers.py | 29 ++------------- backend/dataall/core/feed/__init__.py | 1 + .../dataall/core/feed/services/__init__.py | 1 + .../core/feed/services/feed_registry.py | 36 +++++++++++++++++++ backend/dataall/modules/datasets/__init__.py | 3 ++ 5 files changed, 44 insertions(+), 26 deletions(-) create mode 100644 
backend/dataall/core/feed/__init__.py create mode 100644 backend/dataall/core/feed/services/__init__.py create mode 100644 backend/dataall/core/feed/services/feed_registry.py diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index cbde23f0d..0fff09053 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from ....api.context import Context from ....db import paginate, models -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.core.feed.services.feed_registry import FeedRegistry class Feed: @@ -20,37 +20,14 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - if isinstance(obj, DatasetTableColumn): - return 'DatasetTableColumn' - elif isinstance(obj, models.Worksheet): - return 'Worksheet' - elif isinstance(obj, models.DataPipeline): - return 'DataPipeline' - elif isinstance(obj, models.DatasetTable): - return 'DatasetTable' - elif isinstance(obj, models.Dataset): - return 'Dataset' - elif isinstance(obj, models.DatasetStorageLocation): - return 'DatasetStorageLocation' - elif isinstance(obj, models.Dashboard): - return 'Dashboard' - else: - return None + return FeedRegistry.find_by_model(obj) def resolve_target(context: Context, source: Feed, **kwargs): if not source: return None with context.engine.scoped_session() as session: - model = { - 'Dataset': models.Dataset, - 'DatasetTable': models.DatasetTable, - 'DatasetTableColumn': DatasetTableColumn, - 'DatasetStorageLocation': models.DatasetStorageLocation, - 'Dashboard': models.Dashboard, - 'DataPipeline': models.DataPipeline, - 'Worksheet': models.Worksheet, - }[source.targetType] + model = FeedRegistry.find(source.targetType) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py new file mode 100644 index 000000000..d06f5a78f --- /dev/null +++ b/backend/dataall/core/feed/__init__.py @@ -0,0 +1 @@ +"""Contains all code related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py new file mode 100644 index 000000000..e87be7564 --- /dev/null +++ b/backend/dataall/core/feed/services/__init__.py @@ -0,0 +1 @@ +"""Contains business logic of feed""" diff --git a/backend/dataall/core/feed/services/feed_registry.py b/backend/dataall/core/feed/services/feed_registry.py new file mode 100644 index 000000000..7a382c057 --- /dev/null +++ b/backend/dataall/core/feed/services/feed_registry.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import Dict, Type +from dataall.db import Resource, models + + +@dataclass +class FeedDefinition: + target_type: str + model: Type[Resource] + + +class FeedRegistry: + """Registers feeds for different models""" + _DEFINITION: Dict[str, FeedDefinition] = {} + + @classmethod + def register(cls, feed: FeedDefinition): + cls._DEFINITION[feed.target_type] = feed + + @classmethod + def find(cls, target_type: str): + return cls._DEFINITION[target_type] + + @classmethod + def find_by_model(cls, obj: Resource): + for target_type, feed in cls._DEFINITION.items(): + if isinstance(obj, feed.model): + return target_type + return None + + +FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) 
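# Illustration only, not part of this patch: a self-contained sketch of the
# contract the registry gives the Feed resolvers. ExampleModel stands in for a
# real ORM model such as models.Worksheet; the class below compresses the
# registry defined above to the two lookups the resolvers rely on.
from dataclasses import dataclass
from typing import Dict, Optional, Type


@dataclass
class FeedDefinitionSketch:
    target_type: str
    model: Type


class FeedRegistrySketch:
    _definitions: Dict[str, FeedDefinitionSketch] = {}

    @classmethod
    def register(cls, definition: FeedDefinitionSketch) -> None:
        cls._definitions[definition.target_type] = definition

    @classmethod
    def find(cls, target_type: str) -> FeedDefinitionSketch:
        # used by resolve_target to pick the model queried by targetUri
        return cls._definitions[target_type]

    @classmethod
    def find_by_model(cls, obj) -> Optional[str]:
        # used by resolve_feed_target_type to map an ORM instance to its type
        for target_type, definition in cls._definitions.items():
            if isinstance(obj, definition.model):
                return target_type
        return None


class ExampleModel:
    """Placeholder for a registered SQLAlchemy model."""


FeedRegistrySketch.register(FeedDefinitionSketch("ExampleModel", ExampleModel))
assert FeedRegistrySketch.find_by_model(ExampleModel()) == "ExampleModel"
assert FeedRegistrySketch.find("ExampleModel").model is ExampleModel
# Design note: a module such as datasets can now register DatasetTableColumn
# from its own __init__, so core resolvers no longer need isinstance chains
# over module-owned models.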
+FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index cd52bc207..0de251fee 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,6 +2,8 @@ import logging from typing import List +from dataall.core.feed.services.feed_registry import FeedRegistry, FeedDefinition +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) @@ -16,6 +18,7 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api + FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From c15d0902da9a10a846e2c1f0231f0f0fafc6c0f0 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 19:30:07 +0200 Subject: [PATCH 17/67] Extracted feed and glossary registry and created a model registry --- backend/dataall/api/Objects/Feed/resolvers.py | 6 +-- .../dataall/api/Objects/Glossary/resolvers.py | 23 ++------- backend/dataall/core/feed/__init__.py | 1 - .../dataall/core/feed/services/__init__.py | 1 - .../core/feed/services/feed_registry.py | 36 -------------- backend/dataall/core/utils/__init__.py | 1 + backend/dataall/core/utils/model_registry.py | 47 +++++++++++++++++++ backend/dataall/modules/datasets/__init__.py | 5 +- 8 files changed, 57 insertions(+), 63 deletions(-) delete mode 100644 backend/dataall/core/feed/__init__.py delete mode 100644 backend/dataall/core/feed/services/__init__.py delete mode 100644 backend/dataall/core/feed/services/feed_registry.py create mode 100644 backend/dataall/core/utils/__init__.py create mode 100644 backend/dataall/core/utils/model_registry.py diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 0fff09053..1f328b1ae 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -1,8 +1,8 @@ from sqlalchemy import or_ -from ....api.context import Context -from ....db import paginate, models -from dataall.core.feed.services.feed_registry import FeedRegistry +from dataall.api.context import Context +from dataall.db import paginate, models +from dataall.core.utils.model_registry import FeedRegistry class Feed: diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 847f16ac6..c6f2634d0 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -4,6 +4,7 @@ from .... 
import db from ....api.context import Context +from ....core.utils.model_registry import GlossaryRegistry from ....db import paginate, exceptions, models from ....searchproxy import upsert_dataset from ....searchproxy import upsert_table @@ -11,7 +12,6 @@ from ....api.constants import ( GlossaryRole ) -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def resolve_glossary_node(obj: models.GlossaryNode, *_): @@ -323,29 +323,12 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - if isinstance(obj, DatasetTableColumn): - return 'DatasetTableColumn' - elif isinstance(obj, models.DatasetTable): - return 'DatasetTable' - elif isinstance(obj, models.Dataset): - return 'Dataset' - elif isinstance(obj, models.DatasetStorageLocation): - return 'DatasetStorageLocation' - elif isinstance(obj, models.Dashboard): - return 'Dashboard' - else: - return None + return GlossaryRegistry.find_by_model(obj) def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = { - 'Dataset': models.Dataset, - 'DatasetTable': models.DatasetTable, - 'Column': DatasetTableColumn, - 'DatasetStorageLocation': models.DatasetStorageLocation, - 'Dashboard': models.Dashboard, - }[source.targetType] + model = GlossaryRegistry.find(source.targetUri) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py deleted file mode 100644 index d06f5a78f..000000000 --- a/backend/dataall/core/feed/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains all code related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py deleted file mode 100644 index e87be7564..000000000 --- a/backend/dataall/core/feed/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains business logic of feed""" diff --git a/backend/dataall/core/feed/services/feed_registry.py b/backend/dataall/core/feed/services/feed_registry.py deleted file mode 100644 index 7a382c057..000000000 --- a/backend/dataall/core/feed/services/feed_registry.py +++ /dev/null @@ -1,36 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, Type -from dataall.db import Resource, models - - -@dataclass -class FeedDefinition: - target_type: str - model: Type[Resource] - - -class FeedRegistry: - """Registers feeds for different models""" - _DEFINITION: Dict[str, FeedDefinition] = {} - - @classmethod - def register(cls, feed: FeedDefinition): - cls._DEFINITION[feed.target_type] = feed - - @classmethod - def find(cls, target_type: str): - return cls._DEFINITION[target_type] - - @classmethod - def find_by_model(cls, obj: Resource): - for target_type, feed in cls._DEFINITION.items(): - if isinstance(obj, feed.model): - return target_type - return None - - -FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) -FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) -FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) -FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/core/utils/__init__.py b/backend/dataall/core/utils/__init__.py new file mode 100644 index 000000000..02ed9cfb4 --- /dev/null +++ b/backend/dataall/core/utils/__init__.py @@ -0,0 +1 @@ +"""Utility functions and classes""" diff --git 
a/backend/dataall/core/utils/model_registry.py b/backend/dataall/core/utils/model_registry.py new file mode 100644 index 000000000..9a4c21952 --- /dev/null +++ b/backend/dataall/core/utils/model_registry.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass +from typing import Type, Dict + +from dataall.db import Resource, models + + +@dataclass +class ModelDefinition: + target_type: str + model: Type[Resource] + + +class ModelRegistry: + """Registers models for different target types""" + + def __init__(self): + self._definitions: Dict[str, ModelDefinition] = {} + + def register(self, model: ModelDefinition): + self._definitions[model.target_type] = model + + def find(self, target_type: str): + return self._definitions[target_type] + + def find_by_model(self, obj: Resource): + for target_type, definition in self._definitions.items(): + if isinstance(obj, definition.model): + return target_type + return None + + +# TODO should migrate to a proper file after the modularization +FeedRegistry = ModelRegistry() +GlossaryRegistry = ModelRegistry() + + +FeedRegistry.register(ModelDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(ModelDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +FeedRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +FeedRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) + +GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +GlossaryRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) +GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(ModelDefinition("Dataset", models.Dataset)) diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 0de251fee..8f30bd897 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,7 +2,7 @@ import logging from typing import List -from dataall.core.feed.services.feed_registry import FeedRegistry, FeedDefinition +from dataall.core.utils.model_registry import ModelDefinition, FeedRegistry, GlossaryRegistry from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode @@ -18,7 +18,8 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api - FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) + FeedRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 052a2b1f33139dab09a8beb78d5a7cfb72387128 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 11:12:56 +0200 Subject: [PATCH 18/67] Dataset refactoring Fixed tests and added new for dataset module --- tests/core/test_config.py | 6 +++++- tests/tasks/test_subscriptions.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/core/test_config.py b/tests/core/test_config.py index 3222e4144..30f69f792 100644 --- a/tests/core/test_config.py +++ b/tests/core/test_config.py @@ -25,4 +25,8 @@ def test_default_config(): assert "notebooks" in modules assert "active" in modules["notebooks"] - assert 
config.get_property("modules.notebooks.active") == "true" + assert "datasets" in modules + assert "active" in modules["datasets"] + + assert config.get_property("modules.notebooks.active") + assert config.get_property("modules.datasets.active") diff --git a/tests/tasks/test_subscriptions.py b/tests/tasks/test_subscriptions.py index 25cd6178a..874b8ccab 100644 --- a/tests/tasks/test_subscriptions.py +++ b/tests/tasks/test_subscriptions.py @@ -134,10 +134,10 @@ def share( def test_subscriptions(org, env, otherenv, db, dataset, share, mocker): mocker.patch( - 'dataall.tasks.subscriptions.subscription_service.SubscriptionService.sns_call', + 'dataall.modules.datasets.tasks.subscription_service.SubscriptionService.sns_call', return_value=True, ) - subscriber = dataall.tasks.subscriptions.subscription_service.SubscriptionService() + subscriber = dataall.modules.datasets.tasks.subscription_service.SubscriptionService() messages = [ { 'prefix': 's3://dataset/testtable/csv/', From d9844834646aa7564104ef184ca95d41f0ecbbb2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 13:12:39 +0200 Subject: [PATCH 19/67] Fixed and unignored test_tables_sync --- tests/tasks/test_tables_sync.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tasks/test_tables_sync.py b/tests/tasks/test_tables_sync.py index d4e86b83f..9d8282e65 100644 --- a/tests/tasks/test_tables_sync.py +++ b/tests/tasks/test_tables_sync.py @@ -92,7 +92,13 @@ def table(org, env, db, sync_dataset): yield table -def _test_tables_sync(db, org, env, sync_dataset, table, mocker): +@pytest.fixture(scope='module', autouse=True) +def permissions(db): + with db.scoped_session() as session: + yield dataall.db.api.Permission.init_permissions(session) + + +def test_tables_sync(db, org, env, sync_dataset, table, mocker): mocker.patch( 'dataall.aws.handlers.glue.Glue.list_glue_database_tables', return_value=[ @@ -147,7 +153,7 @@ def _test_tables_sync(db, org, env, sync_dataset, table, mocker): ], ) mocker.patch( - 'dataall.modules.datasets.tables_syncer.is_assumable_pivot_role', return_value=True + 'dataall.modules.datasets.tasks.tables_syncer.is_assumable_pivot_role', return_value=True ) mocker.patch( 'dataall.aws.handlers.glue.Glue.grant_principals_all_table_permissions', From dc0c9350b242be12238c8ce37b0dc74a018ed36d Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 14:10:09 +0200 Subject: [PATCH 20/67] Split model registry into feed and glossaries Glossaries had different target types and had to be treated differently --- backend/dataall/api/Objects/Feed/resolvers.py | 2 +- .../dataall/api/Objects/Glossary/resolvers.py | 7 +-- backend/dataall/core/feed/__init__.py | 1 + .../dataall/core/feed/services/__init__.py | 1 + .../dataall/core/feed/services/registry.py | 36 ++++++++++++++ backend/dataall/core/glossary/__init__.py | 1 + .../core/glossary/services/__init__.py | 1 + .../core/glossary/services/registry.py | 38 +++++++++++++++ backend/dataall/core/utils/model_registry.py | 47 ------------------- backend/dataall/db/api/glossary.py | 28 ++++------- backend/dataall/modules/datasets/__init__.py | 7 +-- 11 files changed, 97 insertions(+), 72 deletions(-) create mode 100644 backend/dataall/core/feed/__init__.py create mode 100644 backend/dataall/core/feed/services/__init__.py create mode 100644 backend/dataall/core/feed/services/registry.py create mode 100644 backend/dataall/core/glossary/__init__.py create mode 100644 backend/dataall/core/glossary/services/__init__.py create mode 
100644 backend/dataall/core/glossary/services/registry.py delete mode 100644 backend/dataall/core/utils/model_registry.py diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 1f328b1ae..598ec86e1 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from dataall.api.context import Context from dataall.db import paginate, models -from dataall.core.utils.model_registry import FeedRegistry +from dataall.core.feed.services.registry import FeedRegistry class Feed: diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index c6f2634d0..ae8501993 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -4,7 +4,6 @@ from .... import db from ....api.context import Context -from ....core.utils.model_registry import GlossaryRegistry from ....db import paginate, exceptions, models from ....searchproxy import upsert_dataset from ....searchproxy import upsert_table @@ -13,6 +12,8 @@ GlossaryRole ) +from dataall.core.glossary.services.registry import GlossaryRegistry + def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -323,12 +324,12 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - return GlossaryRegistry.find_by_model(obj) + return GlossaryRegistry.find_object_type(obj) def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = GlossaryRegistry.find(source.targetUri) + model = GlossaryRegistry.find_model(source.targetUri) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py new file mode 100644 index 000000000..39f751553 --- /dev/null +++ b/backend/dataall/core/feed/__init__.py @@ -0,0 +1 @@ +"""Contains logic related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py new file mode 100644 index 000000000..5b130b24b --- /dev/null +++ b/backend/dataall/core/feed/services/__init__.py @@ -0,0 +1 @@ +"""Service layer of feeds""" diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py new file mode 100644 index 000000000..a69bcdd37 --- /dev/null +++ b/backend/dataall/core/feed/services/registry.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import Type, Dict + +from dataall.db import Resource, models + + +@dataclass +class FeedDefinition: + target_type: str + model: Type[Resource] + + +class FeedRegistry: + """Registers models for different target types""" + + def __init__(self): + self._definitions: Dict[str, FeedDefinition] = {} + + def register(self, model: FeedDefinition): + self._definitions[model.target_type] = model + + def find(self, target_type: str): + return self._definitions[target_type] + + def find_by_model(self, obj: Resource): + for target_type, definition in self._definitions.items(): + if isinstance(obj, definition.model): + return target_type + return None + + +FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) +FeedRegistry.register(FeedDefinition("DatasetStorageLocation", 
models.DatasetStorageLocation)) +FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/core/glossary/__init__.py b/backend/dataall/core/glossary/__init__.py new file mode 100644 index 000000000..aa81c1e26 --- /dev/null +++ b/backend/dataall/core/glossary/__init__.py @@ -0,0 +1 @@ +"""Contains code related to glossaries""" diff --git a/backend/dataall/core/glossary/services/__init__.py b/backend/dataall/core/glossary/services/__init__.py new file mode 100644 index 000000000..9ed65d261 --- /dev/null +++ b/backend/dataall/core/glossary/services/__init__.py @@ -0,0 +1 @@ +"""Service layer of glossaries""" diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/core/glossary/services/registry.py new file mode 100644 index 000000000..7484087c4 --- /dev/null +++ b/backend/dataall/core/glossary/services/registry.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import Type, Dict, Optional + +from dataall.db import Resource, models + + +@dataclass +class GlossaryDefinition: + target_type: str + object_type: str + model: Type[Resource] + + +class GlossaryRegistry: + _DEFINITIONS: Dict[str, GlossaryDefinition] = {} + + @classmethod + def register(cls, glossary: GlossaryDefinition) -> None: + cls._DEFINITIONS[glossary.target_type] = glossary + + @classmethod + def find_model(cls, target_type: str) -> Optional[Resource]: + definition = cls._DEFINITIONS[target_type] + return definition.model if definition is not None else None + + @classmethod + def find_object_type(cls, model: Resource) -> Optional[str]: + for _, definition in cls._DEFINITIONS.items(): + if isinstance(model, definition.model): + return definition.object_type + return None + + +GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) +GlossaryRegistry.register(GlossaryDefinition("Dashboard", "Dashboard", models.Dashboard)) +GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(GlossaryDefinition("Dataset", "Dataset", models.Dataset)) diff --git a/backend/dataall/core/utils/model_registry.py b/backend/dataall/core/utils/model_registry.py deleted file mode 100644 index 9a4c21952..000000000 --- a/backend/dataall/core/utils/model_registry.py +++ /dev/null @@ -1,47 +0,0 @@ -from dataclasses import dataclass -from typing import Type, Dict - -from dataall.db import Resource, models - - -@dataclass -class ModelDefinition: - target_type: str - model: Type[Resource] - - -class ModelRegistry: - """Registers models for different target types""" - - def __init__(self): - self._definitions: Dict[str, ModelDefinition] = {} - - def register(self, model: ModelDefinition): - self._definitions[model.target_type] = model - - def find(self, target_type: str): - return self._definitions[target_type] - - def find_by_model(self, obj: Resource): - for target_type, definition in self._definitions.items(): - if isinstance(obj, definition.model): - return target_type - return None - - -# TODO should migrate to a proper file after the modularization -FeedRegistry = ModelRegistry() -GlossaryRegistry = ModelRegistry() - - -FeedRegistry.register(ModelDefinition("Worksheet", models.Worksheet)) -FeedRegistry.register(ModelDefinition("DataPipeline", models.DataPipeline)) -FeedRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) 
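# Illustration only, not part of this patch: why GlossaryDefinition carries the
# model and a target_uri() helper. Each registered model is expected to expose
# a uri() classmethod returning its primary-key column, so glossary code can
# build one labelled subquery per definition and union them into a single
# 'linked_objects' selectable (list_term_associations is reworked this way
# later in the series). ExampleTarget, its table and columns are invented for
# the sketch; only uri()/target_uri()/definitions() reflect the patch.
from dataclasses import dataclass
from sqlalchemy import Column, String, literal
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session

Base = declarative_base()


class ExampleTarget(Base):
    __tablename__ = 'example_target'
    exampleUri = Column(String, primary_key=True)
    label = Column(String)
    name = Column(String)
    description = Column(String)

    @classmethod
    def uri(cls):
        # the column that becomes 'targetUri' in the union below
        return cls.exampleUri


@dataclass
class GlossaryDefinitionSketch:
    target_type: str
    object_type: str
    model: type

    def target_uri(self):
        return self.model.uri()


def linked_objects_query(session: Session, definitions):
    query = None
    for definition in definitions:
        model = definition.model
        subquery = session.query(
            definition.target_uri().label('targetUri'),
            literal(definition.target_type.lower()).label('targetType'),
            model.label.label('label'),
            model.name.label('name'),
            model.description.label('description'),
        )
        # Query.union() returns a new Query, so reassign the accumulator
        query = subquery if query is None else query.union(subquery)
    return query.subquery('linked_objects') if query is not None else None


definitions = [GlossaryDefinitionSketch('ExampleTarget', 'ExampleTarget', ExampleTarget)]
# usage: linked_objects_query(session, definitions) inside a scoped session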
-FeedRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -FeedRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) - -GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -GlossaryRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) -GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(ModelDefinition("Dataset", models.Dataset)) diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index c6313e007..3df8d34f7 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -4,12 +4,12 @@ from sqlalchemy import asc, or_, and_, literal, case from sqlalchemy.orm import with_expression, aliased -from .. import models, exceptions, permissions, paginate +from .. import models, exceptions, permissions, paginate, Resource from .permission_checker import ( has_tenant_perm, ) from ..models.Glossary import GlossaryNodeStatus -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.core.glossary.services.registry import GlossaryRegistry logger = logging.getLogger(__name__) @@ -124,24 +124,16 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): 'associations are allowed for Glossary terms only', ) - targetUri: str = data['targetUri'] - targetType: str = data['targetType'] - - if targetType == 'Dataset': - target = session.query(models.Dataset).get(targetUri) - elif targetType == 'DatasetTable': - target = session.query(models.DatasetTable).get(targetUri) - elif targetType == 'Folder': - target = session.query(models.DatasetStorageLocation).get(targetUri) - elif targetType == 'Column': - target = session.query(DatasetTableColumn).get(targetUri) - elif targetType == 'Dashboard': - target = session.query(models.Dashboard).get(targetUri) - else: + target_uri: str = data['targetUri'] + target_type: str = data['targetType'] + + target_model: Resource = GlossaryRegistry.find_model(target_type) + if not target_model: raise exceptions.InvalidInput( 'NodeType', 'term.nodeType', 'association target type is invalid' ) + target = session.query(target_model).get(target_uri) if not target: raise exceptions.ObjectNotFound('Association target', uri) @@ -150,8 +142,8 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): approvedByOwner=data.get('approvedByOwner', True), approvedBySteward=data.get('approvedBySteward', True), nodeUri=uri, - targetUri=targetUri, - targetType=targetType, + targetUri=target_uri, + targetType=target_type, ) session.add(link) return link diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 8f30bd897..306778f40 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,7 +2,8 @@ import logging from typing import List -from dataall.core.utils.model_registry import ModelDefinition, FeedRegistry, GlossaryRegistry +from dataall.core.feed.services.registry import FeedRegistry, FeedDefinition +from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode @@ -18,8 +19,8 @@ def is_supported(cls, modes): def __init__(self): import 
dataall.modules.datasets.api - FeedRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) - GlossaryRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) + FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(GlossaryDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 727e3537cfc3b133aa45c0deafc9568b25280b3a Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 15:40:08 +0200 Subject: [PATCH 21/67] Abstraction for glossaries Created API for glossaries to use modularization --- .../core/glossary/services/registry.py | 16 +++++- backend/dataall/db/api/environment.py | 2 +- backend/dataall/db/api/glossary.py | 57 +++++++------------ backend/dataall/db/api/organization.py | 1 - backend/dataall/db/models/Dashboard.py | 3 + backend/dataall/db/models/Dataset.py | 4 ++ .../db/models/DatasetStorageLocation.py | 3 + backend/dataall/db/models/DatasetTable.py | 3 + .../modules/datasets/db/table_column_model.py | 3 + 9 files changed, 50 insertions(+), 42 deletions(-) diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/core/glossary/services/registry.py index 7484087c4..ee3f10d41 100644 --- a/backend/dataall/core/glossary/services/registry.py +++ b/backend/dataall/core/glossary/services/registry.py @@ -1,14 +1,22 @@ from dataclasses import dataclass -from typing import Type, Dict, Optional +from typing import Type, Dict, Optional, Protocol, Union from dataall.db import Resource, models +class Identifiable(Protocol): + def uri(self): + ... + + @dataclass class GlossaryDefinition: target_type: str object_type: str - model: Type[Resource] + model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet + + def target_uri(self): + return self.model.uri() class GlossaryRegistry: @@ -30,6 +38,10 @@ def find_object_type(cls, model: Resource) -> Optional[str]: return definition.object_type return None + @classmethod + def definitions(cls): + return cls._DEFINITIONS.values() + GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) diff --git a/backend/dataall/db/api/environment.py b/backend/dataall/db/api/environment.py index 4a436bf9a..19d5de342 100644 --- a/backend/dataall/db/api/environment.py +++ b/backend/dataall/db/api/environment.py @@ -21,7 +21,7 @@ EnvironmentPermission, ) from ..models.Permission import PermissionType -from ..paginator import Page, paginate +from ..paginator import paginate from dataall.core.environment.models import EnvironmentParameter from ...core.environment.db.repositories import EnvironmentParameterRepository from ...utils.naming_convention import ( diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 3df8d34f7..96c340f62 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -10,6 +10,7 @@ ) from ..models.Glossary import GlossaryNodeStatus from dataall.core.glossary.services.registry import GlossaryRegistry +from ..paginator import Page logger = logging.getLogger(__name__) @@ -339,46 +340,26 @@ def list_term_associations( ): source = data['source'] filter = data['filter'] - datasets = session.query( - models.Dataset.datasetUri.label('targetUri'), - literal('dataset').label('targetType'), - 
models.Dataset.label.label('label'), - models.Dataset.name.label('name'), - models.Dataset.description.label('description'), - ) - tables = session.query( - models.DatasetTable.tableUri.label('targetUri'), - literal('table').label('targetType'), - models.DatasetTable.label.label('label'), - models.DatasetTable.name.label('name'), - models.DatasetTable.description.label('description'), - ) - columns = session.query( - DatasetTableColumn.columnUri.label('targetUri'), - literal('column').label('targetType'), - DatasetTableColumn.label.label('label'), - DatasetTableColumn.name.label('name'), - DatasetTableColumn.description.label('description'), - ) - folders = session.query( - models.DatasetStorageLocation.locationUri.label('targetUri'), - literal('folder').label('targetType'), - models.DatasetStorageLocation.label.label('label'), - models.DatasetStorageLocation.name.label('name'), - models.DatasetStorageLocation.description.label('description'), - ) - dashboards = session.query( - models.Dashboard.dashboardUri.label('targetUri'), - literal('dashboard').label('targetType'), - models.Dashboard.label.label('label'), - models.Dashboard.name.label('name'), - models.Dashboard.description.label('description'), - ) + query = None + for definition in GlossaryRegistry.definitions(): + model = definition.model + subquery = session.query( + definition.target_uri().label('targetUri'), + literal(definition.target_type.lower()).label('targetType'), + model.label.label('label'), + model.name.label('name'), + model.description.label('description'), + ) + if query: + query.union(subquery) + else: + query = subquery - linked_objects = datasets.union(tables, columns, folders, dashboards).subquery( - 'linked_objects' - ) + if query is None: + return Page([], 1, 1, 0) # empty page. All modules are turned off + + linked_objects = query.subquery('linked_objects') path = models.GlossaryNode.path q = ( diff --git a/backend/dataall/db/api/organization.py b/backend/dataall/db/api/organization.py index 979dd1095..dd570eeae 100644 --- a/backend/dataall/db/api/organization.py +++ b/backend/dataall/db/api/organization.py @@ -8,7 +8,6 @@ from . 
import has_tenant_perm, ResourcePolicy, has_resource_perm from ..models import OrganizationGroup from ..models.Enums import OrganisationUserRole -from ..paginator import Page logger = logging.getLogger(__name__) diff --git a/backend/dataall/db/models/Dashboard.py b/backend/dataall/db/models/Dashboard.py index 1a24ef1cb..0b12ecd96 100644 --- a/backend/dataall/db/models/Dashboard.py +++ b/backend/dataall/db/models/Dashboard.py @@ -18,3 +18,6 @@ class Dashboard(Resource, Base): SamlGroupName = Column(String, nullable=False) userRoleForDashboard = query_expression() + + def uri(self): + return self.dashboardUri diff --git a/backend/dataall/db/models/Dataset.py b/backend/dataall/db/models/Dataset.py index 71a95fe0e..451c7da7c 100644 --- a/backend/dataall/db/models/Dataset.py +++ b/backend/dataall/db/models/Dataset.py @@ -59,3 +59,7 @@ class Dataset(Resource, Base): importedKmsKey = Column(Boolean, default=False) importedAdminRole = Column(Boolean, default=False) imported = Column(Boolean, default=False) + + def uri(self): + return self.datasetUri + diff --git a/backend/dataall/db/models/DatasetStorageLocation.py b/backend/dataall/db/models/DatasetStorageLocation.py index 33b121438..e21ae6694 100644 --- a/backend/dataall/db/models/DatasetStorageLocation.py +++ b/backend/dataall/db/models/DatasetStorageLocation.py @@ -17,3 +17,6 @@ class DatasetStorageLocation(Resource, Base): userRoleForStorageLocation = query_expression() projectPermission = query_expression() environmentEndPoint = query_expression() + + def uri(self): + return self.locationUri diff --git a/backend/dataall/db/models/DatasetTable.py b/backend/dataall/db/models/DatasetTable.py index a1b06b192..e97174167 100644 --- a/backend/dataall/db/models/DatasetTable.py +++ b/backend/dataall/db/models/DatasetTable.py @@ -27,3 +27,6 @@ class DatasetTable(Resource, Base): stage = Column(String, default='RAW') topics = Column(postgresql.ARRAY(String), nullable=True) confidentiality = Column(String, nullable=False, default='C1') + + def uri(self): + return self.tableUri diff --git a/backend/dataall/modules/datasets/db/table_column_model.py b/backend/dataall/modules/datasets/db/table_column_model.py index 4d3d7e009..05bc26058 100644 --- a/backend/dataall/modules/datasets/db/table_column_model.py +++ b/backend/dataall/modules/datasets/db/table_column_model.py @@ -18,3 +18,6 @@ class DatasetTableColumn(Resource, Base): columnType = Column( String, default='column' ) # can be either "column" or "partition" + + def uri(self): + return self.columnUri From 49fbb415dbdfc27c20d90366a1cf3bcd41ebea0f Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 15:47:09 +0200 Subject: [PATCH 22/67] Fixed leftovers --- .../dataall/core/feed/services/registry.py | 19 ++++++++++--------- backend/dataall/modules/datasets/__init__.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py index a69bcdd37..07f2e77a1 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/core/feed/services/registry.py @@ -12,18 +12,19 @@ class FeedDefinition: class FeedRegistry: """Registers models for different target types""" + _DEFINITIONS: Dict[str, FeedDefinition] = {} - def __init__(self): - self._definitions: Dict[str, FeedDefinition] = {} + @classmethod + def register(cls, model: FeedDefinition): + cls._DEFINITIONS[model.target_type] = model - def register(self, model: FeedDefinition): - self._definitions[model.target_type] = 
model + @classmethod + def find(cls, target_type: str): + return cls._DEFINITIONS[target_type] - def find(self, target_type: str): - return self._definitions[target_type] - - def find_by_model(self, obj: Resource): - for target_type, definition in self._definitions.items(): + @classmethod + def find_by_model(cls, obj: Resource): + for target_type, definition in cls._DEFINITIONS.items(): if isinstance(obj, definition.model): return target_type return None diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 306778f40..5b7224a48 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -20,7 +20,7 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) - GlossaryRegistry.register(GlossaryDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(GlossaryDefinition("Column", "DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 7d029e73046990bae939f8ff5f6b6cfaa8a83d62 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 10:07:11 +0200 Subject: [PATCH 23/67] Datasets refactoring Added and fixed tests --- backend/dataall/core/feed/services/registry.py | 4 ++-- tests/api/test_feed.py | 3 ++- tests/modules/datasets/__init__.py | 0 tests/modules/datasets/test_dataset_feed.py | 11 +++++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 tests/modules/datasets/__init__.py create mode 100644 tests/modules/datasets/test_dataset_feed.py diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py index 07f2e77a1..893f37f55 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/core/feed/services/registry.py @@ -15,8 +15,8 @@ class FeedRegistry: _DEFINITIONS: Dict[str, FeedDefinition] = {} @classmethod - def register(cls, model: FeedDefinition): - cls._DEFINITIONS[model.target_type] = model + def register(cls, definition: FeedDefinition): + cls._DEFINITIONS[definition.target_type] = definition @classmethod def find(cls, target_type: str): diff --git a/tests/api/test_feed.py b/tests/api/test_feed.py index 11f7c4891..f1d8aaf7f 100644 --- a/tests/api/test_feed.py +++ b/tests/api/test_feed.py @@ -103,4 +103,5 @@ def test_get_target(client, worksheet): targetType='Worksheet', username='me', ) - print(response) + assert response.data.getFeed.target.worksheetUri == worksheet.worksheetUri + diff --git a/tests/modules/datasets/__init__.py b/tests/modules/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/modules/datasets/test_dataset_feed.py b/tests/modules/datasets/test_dataset_feed.py new file mode 100644 index 000000000..52c97e990 --- /dev/null +++ b/tests/modules/datasets/test_dataset_feed.py @@ -0,0 +1,11 @@ + +from dataall.core.feed.services.registry import FeedRegistry +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn + + +def test_dataset_registered(): + model = FeedRegistry.find("DatasetTableColumn") + assert model == DatasetTableColumn + + model = DatasetTableColumn() + assert "DatasetTableColumn" == FeedRegistry.find_by_model(model) From be527ebc26e50f989ca3f9a8e84814d3d316a913 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:40:52 +0200 Subject: [PATCH 24/67] Added runtime type registration for Union GraphQL 
type --- backend/dataall/api/gql/graphql_union_type.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/backend/dataall/api/gql/graphql_union_type.py b/backend/dataall/api/gql/graphql_union_type.py index a8fea7e2f..b67fe3c9a 100644 --- a/backend/dataall/api/gql/graphql_union_type.py +++ b/backend/dataall/api/gql/graphql_union_type.py @@ -1,19 +1,31 @@ +from abc import ABC + from ._cache import cache_instances from .utils import get_named_type +class UnionTypeRegistry(ABC): + """An abstract class that is used to provide union type in runtime""" + + @classmethod + def types(cls): + raise NotImplementedError("Types method is not implemented") + + @cache_instances class Union: _register = {} - def __init__(self, name, types=[], resolver=lambda *_, **__: None): + def __init__(self, name, types=[], type_registry=None, resolver=lambda *_, **__: None): self.name = name self.types = types + self.type_registry = type_registry self.resolver = resolver Union._register[name] = self def gql(self, *args, **kwargs): - return f"union {self.name} = {'|'.join([get_named_type(t).name for t in self.types])}" + types = self.type_registry.types() if self.type_registry else self.types + return f"union {self.name} = {'|'.join([get_named_type(t).name for t in types])}" if __name__ == '__main__': From 3daf2aab7da67320bab4474de2119a93c46840f8 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:42:51 +0200 Subject: [PATCH 25/67] Changed Feed type registration mechanism Moved FeedRegistry to gql since it's more appropriate place for this Started using registry to provide types Renaming and small fixes --- .../feed/services => api/Objects/Feed}/registry.py | 14 ++++++++++---- backend/dataall/api/Objects/Feed/resolvers.py | 6 +++--- backend/dataall/api/Objects/Feed/schema.py | 11 ++--------- backend/dataall/core/feed/__init__.py | 1 - backend/dataall/core/feed/services/__init__.py | 1 - backend/dataall/modules/datasets/__init__.py | 2 +- tests/modules/datasets/test_dataset_feed.py | 6 +++--- 7 files changed, 19 insertions(+), 22 deletions(-) rename backend/dataall/{core/feed/services => api/Objects/Feed}/registry.py (72%) delete mode 100644 backend/dataall/core/feed/__init__.py delete mode 100644 backend/dataall/core/feed/services/__init__.py diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/api/Objects/Feed/registry.py similarity index 72% rename from backend/dataall/core/feed/services/registry.py rename to backend/dataall/api/Objects/Feed/registry.py index 893f37f55..a119529ab 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/api/Objects/Feed/registry.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Type, Dict +from dataall.api import gql +from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models @@ -10,7 +12,7 @@ class FeedDefinition: model: Type[Resource] -class FeedRegistry: +class FeedRegistry(UnionTypeRegistry): """Registers models for different target types""" _DEFINITIONS: Dict[str, FeedDefinition] = {} @@ -19,16 +21,20 @@ def register(cls, definition: FeedDefinition): cls._DEFINITIONS[definition.target_type] = definition @classmethod - def find(cls, target_type: str): - return cls._DEFINITIONS[target_type] + def find_model(cls, target_type: str): + return cls._DEFINITIONS[target_type].model @classmethod - def find_by_model(cls, obj: Resource): + def find_target(cls, obj: Resource): for target_type, definition in 
cls._DEFINITIONS.items(): if isinstance(obj, definition.model): return target_type return None + @classmethod + def types(cls): + return [gql.Ref(target_type) for target_type in cls._DEFINITIONS.keys()] + FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 598ec86e1..08de0d6b7 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from dataall.api.context import Context from dataall.db import paginate, models -from dataall.core.feed.services.registry import FeedRegistry +from dataall.api.Objects.Feed.registry import FeedRegistry class Feed: @@ -20,14 +20,14 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - return FeedRegistry.find_by_model(obj) + return FeedRegistry.find_target(obj) def resolve_target(context: Context, source: Feed, **kwargs): if not source: return None with context.engine.scoped_session() as session: - model = FeedRegistry.find(source.targetType) + model = FeedRegistry.find_model(source.targetType) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/api/Objects/Feed/schema.py b/backend/dataall/api/Objects/Feed/schema.py index d58918716..42fea86ad 100644 --- a/backend/dataall/api/Objects/Feed/schema.py +++ b/backend/dataall/api/Objects/Feed/schema.py @@ -1,18 +1,11 @@ from ... import gql from .resolvers import * +from dataall.api.Objects.Feed.registry import FeedRegistry FeedTarget = gql.Union( name='FeedTarget', - types=[ - gql.Ref('Dataset'), - gql.Ref('DatasetTable'), - gql.Ref('DatasetTableColumn'), - gql.Ref('DatasetStorageLocation'), - gql.Ref('DataPipeline'), - gql.Ref('Worksheet'), - gql.Ref('Dashboard'), - ], + type_registry=FeedRegistry, resolver=resolve_feed_target_type, ) diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py deleted file mode 100644 index 39f751553..000000000 --- a/backend/dataall/core/feed/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains logic related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py deleted file mode 100644 index 5b130b24b..000000000 --- a/backend/dataall/core/feed/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Service layer of feeds""" diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 5b7224a48..6ba4a24e2 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,7 +2,7 @@ import logging from typing import List -from dataall.core.feed.services.registry import FeedRegistry, FeedDefinition +from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode diff --git a/tests/modules/datasets/test_dataset_feed.py b/tests/modules/datasets/test_dataset_feed.py index 52c97e990..db5ff43e2 100644 --- a/tests/modules/datasets/test_dataset_feed.py +++ b/tests/modules/datasets/test_dataset_feed.py @@ -1,11 +1,11 @@ -from dataall.core.feed.services.registry import FeedRegistry +from dataall.api.Objects.Feed.registry import FeedRegistry from 
dataall.modules.datasets.db.table_column_model import DatasetTableColumn def test_dataset_registered(): - model = FeedRegistry.find("DatasetTableColumn") + model = FeedRegistry.find_model("DatasetTableColumn") assert model == DatasetTableColumn model = DatasetTableColumn() - assert "DatasetTableColumn" == FeedRegistry.find_by_model(model) + assert "DatasetTableColumn" == FeedRegistry.find_target(model) From db3bfd3f51c0b7ea2ef39ab62f9176b99e5ebeb5 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:43:56 +0200 Subject: [PATCH 26/67] Added TODO for future refactoring Solve circular dependency for redshift. It should go away after the migration of redshift --- backend/dataall/aws/handlers/redshift.py | 1 + backend/dataall/db/api/redshift_cluster.py | 7 +++++-- deploy/stacks/container.py | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index 4d2591520..c186d5df7 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,6 +9,7 @@ from .sts import SessionHelper from ... import db from ...db import models +# TODO should be migrated in the redshift module from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 4167a555a..8ca3088bf 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -9,7 +9,6 @@ NamingConventionPattern, ) from ...utils.slugify import slugify -from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -495,7 +494,11 @@ def enable_copy_table( session, username, groups, uri, data=None, check_perm=True ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) + + # TODO should be migrated in the redshift module + table = dataall.modules.datasets.services.dataset_table.DatasetTableService.get_dataset_table_by_uri( + session, data['tableUri'] + ) table = models.RedshiftClusterDatasetTable( clusterUri=uri, datasetUri=data['datasetUri'], diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index d3c761519..aa7be04df 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -92,6 +92,7 @@ def __init__( envname, resource_prefix, vpc, vpc_endpoints_sg ) + # TODO introduce the ability to change the deployment depending on config.json file sync_tables_task = self.set_scheduled_task( cluster=cluster, command=['python3.8', '-m', 'dataall.modules.datasets.tasks.tables_syncer'], @@ -174,6 +175,7 @@ def __init__( update_bucket_policies_task.task.security_groups ) + # TODO introduce the ability to change the deployment depending on config.json file subscriptions_task = self.set_scheduled_task( cluster=cluster, command=[ From 13b6e92918b453c3fe6ad20be7ec2d8e74194315 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:11:50 +0200 Subject: [PATCH 27/67] Added GlossaryRegistry for Union scheme --- .../dataall/api/Objects/Glossary/__init__.py | 3 ++- .../Objects/Glossary}/registry.py | 10 ++++++- .../dataall/api/Objects/Glossary/resolvers.py | 27 ++++++++++--------- .../dataall/api/Objects/Glossary/schema.py | 9 ++----- backend/dataall/core/glossary/__init__.py | 1 - 
.../core/glossary/services/__init__.py | 1 - backend/dataall/db/api/glossary.py | 25 ++++++----------- backend/dataall/modules/datasets/__init__.py | 2 +- tests/api/test_glossary.py | 20 +++++++------- 9 files changed, 46 insertions(+), 52 deletions(-) rename backend/dataall/{core/glossary/services => api/Objects/Glossary}/registry.py (80%) delete mode 100644 backend/dataall/core/glossary/__init__.py delete mode 100644 backend/dataall/core/glossary/services/__init__.py diff --git a/backend/dataall/api/Objects/Glossary/__init__.py b/backend/dataall/api/Objects/Glossary/__init__.py index 0c4ec6166..30e86e17e 100644 --- a/backend/dataall/api/Objects/Glossary/__init__.py +++ b/backend/dataall/api/Objects/Glossary/__init__.py @@ -4,6 +4,7 @@ mutations, resolvers, schema, + registry, ) -__all__ = ['resolvers', 'schema', 'input_types', 'queries', 'mutations'] +__all__ = ['registry', 'resolvers', 'schema', 'input_types', 'queries', 'mutations'] diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/api/Objects/Glossary/registry.py similarity index 80% rename from backend/dataall/core/glossary/services/registry.py rename to backend/dataall/api/Objects/Glossary/registry.py index ee3f10d41..375f470e2 100644 --- a/backend/dataall/core/glossary/services/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Type, Dict, Optional, Protocol, Union +from dataall.api import gql +from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models @@ -11,6 +13,7 @@ def uri(self): @dataclass class GlossaryDefinition: + """Glossary's definition used for registration references of other modules""" target_type: str object_type: str model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet @@ -19,7 +22,8 @@ def target_uri(self): return self.model.uri() -class GlossaryRegistry: +class GlossaryRegistry(UnionTypeRegistry): + """Registry of glossary definition and API to retrieve data""" _DEFINITIONS: Dict[str, GlossaryDefinition] = {} @classmethod @@ -42,6 +46,10 @@ def find_object_type(cls, model: Resource) -> Optional[str]: def definitions(cls): return cls._DEFINITIONS.values() + @classmethod + def types(cls): + return [gql.Ref(definition.object_type) for definition in cls._DEFINITIONS.values()] + GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index ae8501993..15e77327f 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -2,6 +2,7 @@ from sqlalchemy import and_, or_, asc +from dataall.api.Objects.Glossary.registry import GlossaryRegistry from .... 
import db from ....api.context import Context from ....db import paginate, exceptions, models @@ -12,8 +13,6 @@ GlossaryRole ) -from dataall.core.glossary.services.registry import GlossaryRegistry - def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -273,8 +272,6 @@ def request_link( with context.engine.scoped_session() as session: return db.api.Glossary.link_term( session=session, - username=context.username, - groups=context.groups, uri=nodeUri, data={ 'targetUri': targetUri, @@ -282,7 +279,7 @@ def request_link( 'approvedByOwner': True, 'approvedBySteward': False, }, - check_perm=True, + target_model=_target_model(targetType), ) @@ -296,8 +293,6 @@ def link_term( with context.engine.scoped_session() as session: return db.api.Glossary.link_term( session=session, - username=context.username, - groups=context.groups, uri=nodeUri, data={ 'targetUri': targetUri, @@ -305,7 +300,7 @@ def link_term( 'approvedByOwner': True, 'approvedBySteward': True, }, - check_perm=True, + target_model=_target_model(targetType), ) @@ -329,7 +324,7 @@ def target_union_resolver(obj, *_): def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = GlossaryRegistry.find_model(source.targetUri) + model = GlossaryRegistry.find_model(source.targetType) target = session.query(model).get(source.targetUri) return target @@ -342,11 +337,8 @@ def resolve_term_associations( with context.engine.scoped_session() as session: return db.api.Glossary.list_term_associations( session=session, - username=context.username, - groups=context.groups, - uri=None, data={'source': source, 'filter': filter}, - check_perm=True, + target_model_definitions=GlossaryRegistry.definitions() ) @@ -477,3 +469,12 @@ def reindex(context, linkUri): upsert_folder(session=session, es=context.es, locationUri=link.targetUri) elif isinstance(target, models.Dashboard): upsert_dashboard(session=session, es=context.es, dashboardUri=link.targetUri) + + +def _target_model(target_type: str): + target_model = GlossaryRegistry.find_model(target_type) + if not target_model: + raise exceptions.InvalidInput( + 'NodeType', 'term.nodeType', 'association target type is invalid' + ) + return target_model diff --git a/backend/dataall/api/Objects/Glossary/schema.py b/backend/dataall/api/Objects/Glossary/schema.py index 36fd1b758..9b71ae4b1 100644 --- a/backend/dataall/api/Objects/Glossary/schema.py +++ b/backend/dataall/api/Objects/Glossary/schema.py @@ -1,6 +1,7 @@ from ... 
import gql from .resolvers import * from ...constants import GlossaryRole +from dataall.api.Objects.Glossary.registry import GlossaryRegistry GlossaryNode = gql.Union( name='GlossaryNode', @@ -246,13 +247,7 @@ GlossaryTermLinkTarget = gql.Union( name='GlossaryTermLinkTarget', - types=[ - gql.Ref('Dataset'), - gql.Ref('DatasetTable'), - gql.Ref('DatasetStorageLocation'), - gql.Ref('DatasetTableColumn'), - gql.Ref('Dashboard'), - ], + type_registry=GlossaryRegistry, resolver=target_union_resolver, ) diff --git a/backend/dataall/core/glossary/__init__.py b/backend/dataall/core/glossary/__init__.py deleted file mode 100644 index aa81c1e26..000000000 --- a/backend/dataall/core/glossary/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains code related to glossaries""" diff --git a/backend/dataall/core/glossary/services/__init__.py b/backend/dataall/core/glossary/services/__init__.py deleted file mode 100644 index 9ed65d261..000000000 --- a/backend/dataall/core/glossary/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Service layer of glossaries""" diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 96c340f62..2dc97c62c 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -5,12 +5,11 @@ from sqlalchemy.orm import with_expression, aliased from .. import models, exceptions, permissions, paginate, Resource -from .permission_checker import ( - has_tenant_perm, -) +from .permission_checker import has_tenant_perm from ..models.Glossary import GlossaryNodeStatus -from dataall.core.glossary.services.registry import GlossaryRegistry from ..paginator import Page +from dataall.core.permission_checker import has_tenant_permission +from dataall.core.context import get_context logger = logging.getLogger(__name__) @@ -113,8 +112,8 @@ def update_node(session, username, groups, uri, data=None, check_perm=None): return node @staticmethod - @has_tenant_perm(permissions.MANAGE_GLOSSARIES) - def link_term(session, username, groups, uri, data=None, check_perm=None): + @has_tenant_permission(permissions.MANAGE_GLOSSARIES) + def link_term(session, uri, target_model: Resource, data): term: models.GlossaryNode = session.query(models.GlossaryNode).get(uri) if not term: raise exceptions.ObjectNotFound('Node', uri) @@ -128,18 +127,12 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): target_uri: str = data['targetUri'] target_type: str = data['targetType'] - target_model: Resource = GlossaryRegistry.find_model(target_type) - if not target_model: - raise exceptions.InvalidInput( - 'NodeType', 'term.nodeType', 'association target type is invalid' - ) - target = session.query(target_model).get(target_uri) if not target: raise exceptions.ObjectNotFound('Association target', uri) link = models.TermLink( - owner=username, + owner=get_context().username, approvedByOwner=data.get('approvedByOwner', True), approvedBySteward=data.get('approvedBySteward', True), nodeUri=uri, @@ -335,14 +328,12 @@ def list_node_children(session, source, filter): ).to_dict() @staticmethod - def list_term_associations( - session, username, groups, uri, data=None, check_perm=None - ): + def list_term_associations(session, target_model_definitions, data=None): source = data['source'] filter = data['filter'] query = None - for definition in GlossaryRegistry.definitions(): + for definition in target_model_definitions: model = definition.model subquery = session.query( definition.target_uri().label('targetUri'), diff --git 
a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 6ba4a24e2..de1963bd2 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -3,7 +3,7 @@ from typing import List from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition -from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition +from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index 8276dca8c..987ccc1a8 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List from dataall.db import models from dataall.modules.datasets.db.table_column_model import DatasetTableColumn @@ -197,7 +198,6 @@ def test_list_glossaries(client): } """ ) - print(response) assert response.data.listGlossaries.count == 1 assert response.data.listGlossaries.nodes[0].stats.categories == 2 @@ -246,7 +246,6 @@ def test_hierarchical_search(client): } """ ) - print(response) assert response.data.searchGlossary.count == 4 @@ -263,7 +262,6 @@ def test_get_glossary(client, g1): """, nodeUri=g1.nodeUri, ) - print(r) assert r.data.getGlossary.nodeUri == g1.nodeUri assert r.data.getGlossary.label == g1.label assert r.data.getGlossary.readme == g1.readme @@ -301,7 +299,6 @@ def test_get_term(client, t1): """, nodeUri=t1.nodeUri, ) - print(r) assert r.data.getTerm.nodeUri == t1.nodeUri assert r.data.getTerm.label == t1.label assert r.data.getTerm.readme == t1.readme @@ -552,7 +549,7 @@ def test_link_term(client, t1, _columns, group): print(r) -def test_get_term_associations(t1, client): +def test_get_term_associations(t1, db, client): r = client.query( """ query GetTerm($nodeUri:String!){ @@ -579,10 +576,13 @@ def test_get_term_associations(t1, client): nodeUri=t1.nodeUri, username='alice', ) - print(r) + assert r.data.getTerm.nodeUri == t1.nodeUri + assert r.data.getTerm.label == t1.label + assert r.data.getTerm.readme == t1.readme -def test_delete_category(client, c1, group): +def test_delete_category(client, db, c1, group): + now = datetime.now() r = client.query( """ mutation DeleteCategory( @@ -597,7 +597,9 @@ def test_delete_category(client, c1, group): username='alice', groups=[group.name], ) - print(r) + with db.scoped_session() as session: + node = session.query(models.GlossaryNode).get(c1.nodeUri) + assert node.deleted >= now def test_list_glossaries_after_delete(client): @@ -634,7 +636,6 @@ def test_list_glossaries_after_delete(client): } """ ) - print(response) assert response.data.listGlossaries.count == 1 assert response.data.listGlossaries.nodes[0].stats.categories == 0 @@ -683,5 +684,4 @@ def test_hierarchical_search_after_delete(client): } """ ) - print(response) assert response.data.searchGlossary.count == 1 From 144dfea5a1026d665eda7e6384adc5781297c5ba Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:15:08 +0200 Subject: [PATCH 28/67] Changed import in redshift module --- backend/dataall/db/api/redshift_cluster.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 8ca3088bf..31b795225 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ 
b/backend/dataall/db/api/redshift_cluster.py @@ -495,8 +495,9 @@ def enable_copy_table( ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - # TODO should be migrated in the redshift module - table = dataall.modules.datasets.services.dataset_table.DatasetTableService.get_dataset_table_by_uri( + # TODO this dirty hack should be removed in the redshift module or after pipeline migration (circular import) + from dataall.modules.datasets.services.dataset_table import DatasetTableService + table = DatasetTableService.get_dataset_table_by_uri( session, data['tableUri'] ) table = models.RedshiftClusterDatasetTable( From d43b9b31b661287e303cfad8fc958c7f511277d2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:18:04 +0200 Subject: [PATCH 29/67] No need for Utils yet --- backend/dataall/core/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 backend/dataall/core/utils/__init__.py diff --git a/backend/dataall/core/utils/__init__.py b/backend/dataall/core/utils/__init__.py deleted file mode 100644 index 02ed9cfb4..000000000 --- a/backend/dataall/core/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utility functions and classes""" From 39b244c9b2fc841405e53088a48f8c564850b505 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:22:39 +0200 Subject: [PATCH 30/67] Fixed linting --- backend/dataall/db/models/Dataset.py | 1 - .../modules/datasets/handlers/__init__.py | 1 - .../datasets/handlers/glue_column_handler.py | 19 +++++++++---------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/backend/dataall/db/models/Dataset.py b/backend/dataall/db/models/Dataset.py index 451c7da7c..fd65387b7 100644 --- a/backend/dataall/db/models/Dataset.py +++ b/backend/dataall/db/models/Dataset.py @@ -62,4 +62,3 @@ class Dataset(Resource, Base): def uri(self): return self.datasetUri - diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py index 19bd47297..a5d506712 100644 --- a/backend/dataall/modules/datasets/handlers/__init__.py +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -8,4 +8,3 @@ ) __all__ = ["glue_column_handler", "glue_table_handler"] - diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index 02003eea2..329b702b7 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -67,16 +67,15 @@ def update_table_columns(engine, task: models.Task): updated_table = { k: v for k, v in original_table['Table'].items() - if k - not in [ - 'CatalogId', - 'VersionId', - 'DatabaseName', - 'CreateTime', - 'UpdateTime', - 'CreatedBy', - 'IsRegisteredWithLakeFormation', - ] + if k not in [ + 'CatalogId', + 'VersionId', + 'DatabaseName', + 'CreateTime', + 'UpdateTime', + 'CreatedBy', + 'IsRegisteredWithLakeFormation', + ] } all_columns = updated_table.get('StorageDescriptor', {}).get( 'Columns', [] From cb3800a8aae649c8ff14d937440ea215e106db09 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:24:26 +0200 Subject: [PATCH 31/67] Datasets refactoring Moving datasets profiling to datasets modules --- backend/dataall/api/Objects/__init__.py | 1 - .../datasets/api}/DatasetProfiling/__init__.py | 0 .../datasets/api}/DatasetProfiling/input_types.py | 2 +- .../datasets/api}/DatasetProfiling/mutations.py | 
2 +- .../datasets/api}/DatasetProfiling/queries.py | 2 +- .../datasets/api}/DatasetProfiling/resolvers.py | 10 +++++----- .../datasets/api}/DatasetProfiling/schema.py | 2 +- backend/dataall/modules/datasets/api/__init__.py | 5 +++-- 8 files changed, 12 insertions(+), 12 deletions(-) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/__init__.py (100%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/input_types.py (95%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/mutations.py (95%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/queries.py (97%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/resolvers.py (95%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetProfiling/schema.py (98%) diff --git a/backend/dataall/api/Objects/__init__.py b/backend/dataall/api/Objects/__init__.py index 43d5e0833..80b91358a 100644 --- a/backend/dataall/api/Objects/__init__.py +++ b/backend/dataall/api/Objects/__init__.py @@ -29,7 +29,6 @@ Test, SagemakerStudio, RedshiftCluster, - DatasetProfiling, Glossary, AthenaQueryResult, Worksheet, diff --git a/backend/dataall/api/Objects/DatasetProfiling/__init__.py b/backend/dataall/modules/datasets/api/DatasetProfiling/__init__.py similarity index 100% rename from backend/dataall/api/Objects/DatasetProfiling/__init__.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/__init__.py diff --git a/backend/dataall/api/Objects/DatasetProfiling/input_types.py b/backend/dataall/modules/datasets/api/DatasetProfiling/input_types.py similarity index 95% rename from backend/dataall/api/Objects/DatasetProfiling/input_types.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/input_types.py index deb1739c5..e8e89fb16 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/input_types.py +++ b/backend/dataall/modules/datasets/api/DatasetProfiling/input_types.py @@ -1,4 +1,4 @@ -from ... import gql +from dataall.api import gql StartDatasetProfilingRunInput = gql.InputType( name='StartDatasetProfilingRunInput', diff --git a/backend/dataall/api/Objects/DatasetProfiling/mutations.py b/backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py similarity index 95% rename from backend/dataall/api/Objects/DatasetProfiling/mutations.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py index 5876c81a7..778526048 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/mutations.py +++ b/backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py @@ -1,4 +1,4 @@ -from ... import gql +from dataall.api import gql from .resolvers import * startDatasetProfilingRun = gql.MutationField( diff --git a/backend/dataall/api/Objects/DatasetProfiling/queries.py b/backend/dataall/modules/datasets/api/DatasetProfiling/queries.py similarity index 97% rename from backend/dataall/api/Objects/DatasetProfiling/queries.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/queries.py index 9ab3eb2bb..2225e2117 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/queries.py +++ b/backend/dataall/modules/datasets/api/DatasetProfiling/queries.py @@ -1,4 +1,4 @@ -from ... 
import gql +from dataall.api import gql from .resolvers import * diff --git a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py b/backend/dataall/modules/datasets/api/DatasetProfiling/resolvers.py similarity index 95% rename from backend/dataall/api/Objects/DatasetProfiling/resolvers.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/resolvers.py index 4b4684019..a84d0d82f 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py +++ b/backend/dataall/modules/datasets/api/DatasetProfiling/resolvers.py @@ -1,11 +1,11 @@ import json import logging -from ....api.context import Context -from ....aws.handlers.service_handlers import Worker -from ....aws.handlers.sts import SessionHelper -from ....db import api, permissions, models -from ....db.api import ResourcePolicy +from dataall.api.context import Context +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import api, permissions, models +from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) diff --git a/backend/dataall/api/Objects/DatasetProfiling/schema.py b/backend/dataall/modules/datasets/api/DatasetProfiling/schema.py similarity index 98% rename from backend/dataall/api/Objects/DatasetProfiling/schema.py rename to backend/dataall/modules/datasets/api/DatasetProfiling/schema.py index f6fe9c575..f79022a51 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/schema.py +++ b/backend/dataall/modules/datasets/api/DatasetProfiling/schema.py @@ -1,4 +1,4 @@ -from ... import gql +from dataall.api import gql from .resolvers import ( resolve_dataset, get_profiling_run_status, diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 538df0734..00d4dd3b8 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1,6 +1,7 @@ """The GraphQL schema of datasets and related functionality""" from dataall.modules.datasets.api import ( - table_column + table_column, + DatasetProfiling ) -__all__ = ["table_column"] +__all__ = ["table_column", "DatasetProfiling"] From dd8e597a2e0a301342f0fa4b406b9a37f9e445a2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:28:55 +0200 Subject: [PATCH 32/67] Datasets refactoring Renaming profiling --- backend/dataall/modules/datasets/api/__init__.py | 4 ++-- .../api/{DatasetProfiling => profiling}/__init__.py | 2 +- .../api/{DatasetProfiling => profiling}/input_types.py | 0 .../api/{DatasetProfiling => profiling}/mutations.py | 5 ++++- .../api/{DatasetProfiling => profiling}/queries.py | 7 ++++++- .../api/{DatasetProfiling => profiling}/resolvers.py | 0 .../datasets/api/{DatasetProfiling => profiling}/schema.py | 2 +- tests/api/test_dataset_profiling.py | 4 ++-- 8 files changed, 16 insertions(+), 8 deletions(-) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/__init__.py (73%) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/input_types.py (100%) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/mutations.py (83%) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/queries.py (86%) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/resolvers.py (100%) rename backend/dataall/modules/datasets/api/{DatasetProfiling => profiling}/schema.py 
(96%) diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 00d4dd3b8..6bb6f8ab0 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1,7 +1,7 @@ """The GraphQL schema of datasets and related functionality""" from dataall.modules.datasets.api import ( table_column, - DatasetProfiling + profiling ) -__all__ = ["table_column", "DatasetProfiling"] +__all__ = ["table_column", "profiling"] diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/__init__.py b/backend/dataall/modules/datasets/api/profiling/__init__.py similarity index 73% rename from backend/dataall/modules/datasets/api/DatasetProfiling/__init__.py rename to backend/dataall/modules/datasets/api/profiling/__init__.py index dfa46b264..4c5b6c491 100644 --- a/backend/dataall/modules/datasets/api/DatasetProfiling/__init__.py +++ b/backend/dataall/modules/datasets/api/profiling/__init__.py @@ -1,4 +1,4 @@ -from . import ( +from dataall.modules.datasets.api.profiling import ( input_types, mutations, queries, diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/input_types.py b/backend/dataall/modules/datasets/api/profiling/input_types.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetProfiling/input_types.py rename to backend/dataall/modules/datasets/api/profiling/input_types.py diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py b/backend/dataall/modules/datasets/api/profiling/mutations.py similarity index 83% rename from backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py rename to backend/dataall/modules/datasets/api/profiling/mutations.py index 778526048..e4bcd62cc 100644 --- a/backend/dataall/modules/datasets/api/DatasetProfiling/mutations.py +++ b/backend/dataall/modules/datasets/api/profiling/mutations.py @@ -1,5 +1,8 @@ from dataall.api import gql -from .resolvers import * +from dataall.modules.datasets.api.profiling.resolvers import ( + start_profiling_run, + update_profiling_run_results +) startDatasetProfilingRun = gql.MutationField( name='startDatasetProfilingRun', diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/queries.py b/backend/dataall/modules/datasets/api/profiling/queries.py similarity index 86% rename from backend/dataall/modules/datasets/api/DatasetProfiling/queries.py rename to backend/dataall/modules/datasets/api/profiling/queries.py index 2225e2117..8d2fbb25c 100644 --- a/backend/dataall/modules/datasets/api/DatasetProfiling/queries.py +++ b/backend/dataall/modules/datasets/api/profiling/queries.py @@ -1,5 +1,10 @@ from dataall.api import gql -from .resolvers import * +from dataall.modules.datasets.api.profiling.resolvers import ( + get_profiling_run, + list_profiling_runs, + list_table_profiling_runs, + get_last_table_profiling_run +) getDatasetProfilingRun = gql.QueryField( diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/resolvers.py b/backend/dataall/modules/datasets/api/profiling/resolvers.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetProfiling/resolvers.py rename to backend/dataall/modules/datasets/api/profiling/resolvers.py diff --git a/backend/dataall/modules/datasets/api/DatasetProfiling/schema.py b/backend/dataall/modules/datasets/api/profiling/schema.py similarity index 96% rename from backend/dataall/modules/datasets/api/DatasetProfiling/schema.py rename to 
backend/dataall/modules/datasets/api/profiling/schema.py index f79022a51..6babb61b3 100644 --- a/backend/dataall/modules/datasets/api/DatasetProfiling/schema.py +++ b/backend/dataall/modules/datasets/api/profiling/schema.py @@ -1,5 +1,5 @@ from dataall.api import gql -from .resolvers import ( +from dataall.modules.datasets.api.profiling.resolvers import ( resolve_dataset, get_profiling_run_status, get_profiling_results, diff --git a/tests/api/test_dataset_profiling.py b/tests/api/test_dataset_profiling.py index c5bed6d1e..ad410a610 100644 --- a/tests/api/test_dataset_profiling.py +++ b/tests/api/test_dataset_profiling.py @@ -129,7 +129,7 @@ def test_get_table_profiling_run( client, dataset1, env1, module_mocker, table, db, group ): module_mocker.patch( - 'dataall.api.Objects.DatasetProfiling.resolvers.get_profiling_results_from_s3', + 'dataall.modules.datasets.api.profiling.resolvers.get_profiling_results_from_s3', return_value='{"results": "yes"}', ) runs = list_profiling_runs(client, dataset1, group) @@ -169,7 +169,7 @@ def test_list_table_profiling_runs( client, dataset1, env1, module_mocker, table, db, group ): module_mocker.patch( - 'dataall.api.Objects.DatasetProfiling.resolvers.get_profiling_results_from_s3', + 'dataall.modules.datasets.api.profiling.resolvers.get_profiling_results_from_s3', return_value='{"results": "yes"}', ) module_mocker.patch('requests.post', return_value=True) From 8ca7bea0feec8594ce6ca43f5bdadfe8951d0406 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:32:50 +0200 Subject: [PATCH 33/67] Datasets refactoring Renaming table_column_model to models to make it easier to import other models --- backend/dataall/modules/datasets/__init__.py | 2 +- backend/dataall/modules/datasets/api/table_column/resolvers.py | 2 +- .../modules/datasets/db/{table_column_model.py => models.py} | 0 .../dataall/modules/datasets/handlers/glue_column_handler.py | 2 +- backend/dataall/modules/datasets/services/dataset_table.py | 2 +- tests/api/test_dataset_table.py | 2 +- tests/api/test_glossary.py | 2 +- tests/modules/datasets/test_dataset_feed.py | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename backend/dataall/modules/datasets/db/{table_column_model.py => models.py} (100%) diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index de1963bd2..4620495fe 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -4,7 +4,7 @@ from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index b958f2f7a..8e78a042e 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -6,7 +6,7 @@ from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import 
DatasetTableColumn def list_table_columns( diff --git a/backend/dataall/modules/datasets/db/table_column_model.py b/backend/dataall/modules/datasets/db/models.py similarity index 100% rename from backend/dataall/modules/datasets/db/table_column_model.py rename to backend/dataall/modules/datasets/db/models.py diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index 329b702b7..df43f9dbd 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -5,7 +5,7 @@ from dataall.aws.handlers.sts import SessionHelper from dataall.db import models from dataall.aws.handlers.service_handlers import Worker -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index 873cbe01e..cd02eadf5 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -6,7 +6,7 @@ from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment from dataall.db.models import Dataset from dataall.utils import json_utils -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn logger = logging.getLogger(__name__) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index a2fcb2add..88140b68c 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -4,7 +4,7 @@ import dataall from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn @pytest.fixture(scope='module', autouse=True) diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index 987ccc1a8..bb7f34516 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List from dataall.db import models -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn import pytest diff --git a/tests/modules/datasets/test_dataset_feed.py b/tests/modules/datasets/test_dataset_feed.py index db5ff43e2..06ffdc8ed 100644 --- a/tests/modules/datasets/test_dataset_feed.py +++ b/tests/modules/datasets/test_dataset_feed.py @@ -1,6 +1,6 @@ from dataall.api.Objects.Feed.registry import FeedRegistry -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn def test_dataset_registered(): From e36ab3b0df34c9fa9c86de3f755dca6b547faa73 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:42:42 +0200 Subject: [PATCH 34/67] Datasets refactoring Moving DatasetProfilingRun model --- backend/dataall/aws/handlers/glue.py | 5 +- .../dataall/db/api/dataset_profiling_run.py | 47 ++++++++++--------- .../dataall/db/models/DatasetProfilingRun.py | 20 -------- backend/dataall/db/models/__init__.py | 
1 - .../datasets/api/profiling/resolvers.py | 9 ++-- backend/dataall/modules/datasets/db/models.py | 21 +++++++-- tests/api/test_dataset_profiling.py | 5 +- 7 files changed, 53 insertions(+), 55 deletions(-) delete mode 100644 backend/dataall/db/models/DatasetProfilingRun.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index e05ce4c54..a61afc3a1 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,6 +6,7 @@ from .sts import SessionHelper from ... import db from ...db import models +from dataall.modules.datasets.db.models import DatasetProfilingRun log = logging.getLogger('aws:glue') @@ -526,7 +527,7 @@ def get_job_runs(engine, task: models.Task): @Worker.handler('glue.job.start_profiling_run') def start_profiling_run(engine, task: models.Task): with engine.scoped_session() as session: - profiling: models.DatasetProfilingRun = ( + profiling: DatasetProfilingRun = ( db.api.DatasetProfilingRun.get_profiling_run( session, profilingRunUri=task.targetUri ) @@ -572,7 +573,7 @@ def run_job(**data): @Worker.handler('glue.job.profiling_run_status') def get_profiling_run(engine, task: models.Task): with engine.scoped_session() as session: - profiling: models.DatasetProfilingRun = ( + profiling: DatasetProfilingRun = ( db.api.DatasetProfilingRun.get_profiling_run( session, profilingRunUri=task.targetUri ) diff --git a/backend/dataall/db/api/dataset_profiling_run.py b/backend/dataall/db/api/dataset_profiling_run.py index f1552bc81..5e7024843 100644 --- a/backend/dataall/db/api/dataset_profiling_run.py +++ b/backend/dataall/db/api/dataset_profiling_run.py @@ -2,6 +2,7 @@ from .. import paginate, models from ..exceptions import ObjectNotFound +from dataall.modules.datasets.db.models import DatasetProfilingRun as DatasetProfilingRunModel class DatasetProfilingRun: @@ -30,7 +31,7 @@ def start_profiling( if not environment: raise ObjectNotFound('Environment', dataset.environmentUri) - run = models.DatasetProfilingRun( + run = DatasetProfilingRunModel( datasetUri=dataset.datasetUri, status='RUNNING', AwsAccountId=environment.AwsAccountId, @@ -72,14 +73,14 @@ def get_profiling_run( session, profilingRunUri=None, GlueJobRunId=None, GlueTableName=None ): if profilingRunUri: - run: models.DatasetProfilingRun = session.query( - models.DatasetProfilingRun + run: DatasetProfilingRunModel = session.query( + DatasetProfilingRunModel ).get(profilingRunUri) else: - run: models.DatasetProfilingRun = ( - session.query(models.DatasetProfilingRun) - .filter(models.DatasetProfilingRun.GlueJobRunId == GlueJobRunId) - .filter(models.DatasetProfilingRun.GlueTableName == GlueTableName) + run: DatasetProfilingRunModel = ( + session.query(DatasetProfilingRunModel) + .filter(DatasetProfilingRunModel.GlueJobRunId == GlueJobRunId) + .filter(DatasetProfilingRunModel.GlueTableName == GlueTableName) .first() ) return run @@ -89,9 +90,9 @@ def list_profiling_runs(session, datasetUri, filter: dict = None): if not filter: filter = {} q = ( - session.query(models.DatasetProfilingRun) - .filter(models.DatasetProfilingRun.datasetUri == datasetUri) - .order_by(models.DatasetProfilingRun.created.desc()) + session.query(DatasetProfilingRunModel) + .filter(DatasetProfilingRunModel.datasetUri == datasetUri) + .order_by(DatasetProfilingRunModel.created.desc()) ) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 20) @@ -102,19 +103,19 @@ def list_table_profiling_runs(session, tableUri, filter): if not filter: filter = {} q = ( - 
session.query(models.DatasetProfilingRun) + session.query(DatasetProfilingRunModel) .join( models.DatasetTable, - models.DatasetTable.datasetUri == models.DatasetProfilingRun.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, ) .filter( and_( models.DatasetTable.tableUri == tableUri, models.DatasetTable.GlueTableName - == models.DatasetProfilingRun.GlueTableName, + == DatasetProfilingRunModel.GlueTableName, ) ) - .order_by(models.DatasetProfilingRun.created.desc()) + .order_by(DatasetProfilingRunModel.created.desc()) ) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 20) @@ -123,34 +124,34 @@ def list_table_profiling_runs(session, tableUri, filter): @staticmethod def get_table_last_profiling_run(session, tableUri): return ( - session.query(models.DatasetProfilingRun) + session.query(DatasetProfilingRunModel) .join( models.DatasetTable, - models.DatasetTable.datasetUri == models.DatasetProfilingRun.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, ) .filter(models.DatasetTable.tableUri == tableUri) .filter( models.DatasetTable.GlueTableName - == models.DatasetProfilingRun.GlueTableName + == DatasetProfilingRunModel.GlueTableName ) - .order_by(models.DatasetProfilingRun.created.desc()) + .order_by(DatasetProfilingRunModel.created.desc()) .first() ) @staticmethod def get_table_last_profiling_run_with_results(session, tableUri): return ( - session.query(models.DatasetProfilingRun) + session.query(DatasetProfilingRunModel) .join( models.DatasetTable, - models.DatasetTable.datasetUri == models.DatasetProfilingRun.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, ) .filter(models.DatasetTable.tableUri == tableUri) .filter( models.DatasetTable.GlueTableName - == models.DatasetProfilingRun.GlueTableName + == DatasetProfilingRunModel.GlueTableName ) - .filter(models.DatasetProfilingRun.results.isnot(None)) - .order_by(models.DatasetProfilingRun.created.desc()) + .filter(DatasetProfilingRunModel.results.isnot(None)) + .order_by(DatasetProfilingRunModel.created.desc()) .first() ) diff --git a/backend/dataall/db/models/DatasetProfilingRun.py b/backend/dataall/db/models/DatasetProfilingRun.py deleted file mode 100644 index b4996db64..000000000 --- a/backend/dataall/db/models/DatasetProfilingRun.py +++ /dev/null @@ -1,20 +0,0 @@ -from sqlalchemy import Column, String -from sqlalchemy.dialects.postgresql import JSON - -from .. 
import Base, Resource, utils - - -class DatasetProfilingRun(Resource, Base): - __tablename__ = 'dataset_profiling_run' - profilingRunUri = Column( - String, primary_key=True, default=utils.uuid('profilingrun') - ) - datasetUri = Column(String, nullable=False) - GlueJobName = Column(String) - GlueJobRunId = Column(String) - GlueTriggerSchedule = Column(String) - GlueTriggerName = Column(String) - GlueTableName = Column(String) - AwsAccountId = Column(String) - results = Column(JSON, default={}) - status = Column(String, default='Created') diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py index 1ab4134b3..f25e5f59b 100644 --- a/backend/dataall/db/models/__init__.py +++ b/backend/dataall/db/models/__init__.py @@ -5,7 +5,6 @@ from .DashboardShare import DashboardShare from .DashboardShare import DashboardShareStatus from .Dataset import Dataset -from .DatasetProfilingRun import DatasetProfilingRun from .DatasetQualityRule import DatasetQualityRule from .DatasetStorageLocation import DatasetStorageLocation from .DatasetTable import DatasetTable diff --git a/backend/dataall/modules/datasets/api/profiling/resolvers.py b/backend/dataall/modules/datasets/api/profiling/resolvers.py index a84d0d82f..05efbbab8 100644 --- a/backend/dataall/modules/datasets/api/profiling/resolvers.py +++ b/backend/dataall/modules/datasets/api/profiling/resolvers.py @@ -7,11 +7,12 @@ from dataall.db import api, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.models import DatasetProfilingRun log = logging.getLogger(__name__) -def resolve_dataset(context, source: models.DatasetProfilingRun): +def resolve_dataset(context, source: DatasetProfilingRun): if not source: return None with context.engine.scoped_session() as session: @@ -49,7 +50,7 @@ def start_profiling_run(context: Context, source, input: dict = None): return run -def get_profiling_run_status(context: Context, source: models.DatasetProfilingRun): +def get_profiling_run_status(context: Context, source: DatasetProfilingRun): if not source: return None with context.engine.scoped_session() as session: @@ -61,7 +62,7 @@ def get_profiling_run_status(context: Context, source: models.DatasetProfilingRu return source.status -def get_profiling_results(context: Context, source: models.DatasetProfilingRun): +def get_profiling_results(context: Context, source: DatasetProfilingRun): if not source or source.results == {}: return None else: @@ -90,7 +91,7 @@ def get_profiling_run(context: Context, source, profilingRunUri=None): def get_last_table_profiling_run(context: Context, source, tableUri=None): with context.engine.scoped_session() as session: - run: models.DatasetProfilingRun = ( + run: DatasetProfilingRun = ( api.DatasetProfilingRun.get_table_last_profiling_run( session=session, tableUri=tableUri ) diff --git a/backend/dataall/modules/datasets/db/models.py b/backend/dataall/modules/datasets/db/models.py index 05bc26058..1ba60bea1 100644 --- a/backend/dataall/modules/datasets/db/models.py +++ b/backend/dataall/modules/datasets/db/models.py @@ -1,7 +1,6 @@ from sqlalchemy import Column, String - -from dataall.db import Base -from dataall.db import Resource, utils +from sqlalchemy.dialects.postgresql import JSON +from dataall.db import Base, Resource, utils class DatasetTableColumn(Resource, Base): @@ -21,3 +20,19 @@ class DatasetTableColumn(Resource, Base): def uri(self): return self.columnUri + + +class 
DatasetProfilingRun(Resource, Base): + __tablename__ = 'dataset_profiling_run' + profilingRunUri = Column( + String, primary_key=True, default=utils.uuid('profilingrun') + ) + datasetUri = Column(String, nullable=False) + GlueJobName = Column(String) + GlueJobRunId = Column(String) + GlueTriggerSchedule = Column(String) + GlueTriggerName = Column(String) + GlueTableName = Column(String) + AwsAccountId = Column(String) + results = Column(JSON, default={}) + status = Column(String, default='Created') diff --git a/tests/api/test_dataset_profiling.py b/tests/api/test_dataset_profiling.py index ad410a610..8d708e94d 100644 --- a/tests/api/test_dataset_profiling.py +++ b/tests/api/test_dataset_profiling.py @@ -2,6 +2,7 @@ import pytest import dataall +from dataall.modules.datasets.db.models import DatasetProfilingRun @pytest.fixture(scope='module', autouse=True) @@ -39,7 +40,7 @@ def test_add_tables(table, dataset1, db): def update_runs(db, runs): with db.scoped_session() as session: for run in runs: - run = session.query(dataall.db.models.DatasetProfilingRun).get( + run = session.query(DatasetProfilingRun).get( run['profilingRunUri'] ) run.status = 'SUCCEEDED' @@ -70,7 +71,7 @@ def test_start_profiling(org1, env1, dataset1, client, module_mocker, db, user, profiling = response.data.startDatasetProfilingRun assert profiling.profilingRunUri with db.scoped_session() as session: - profiling = session.query(dataall.db.models.DatasetProfilingRun).get( + profiling = session.query(DatasetProfilingRun).get( profiling.profilingRunUri ) profiling.GlueJobRunId = 'jr_111111111111' From 31720c254c3286e7a30c049e7c6835b5d4602c64 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:52:44 +0200 Subject: [PATCH 35/67] Datasets refactoring Moving dataset profiling service and renaming it --- backend/dataall/aws/handlers/glue.py | 7 ++++--- backend/dataall/db/api/__init__.py | 1 - .../modules/datasets/api/profiling/resolvers.py | 15 ++++++++------- .../services/dataset_profiling_service.py} | 8 ++++---- .../datasets/tasks/subscription_service.py | 3 ++- 5 files changed, 18 insertions(+), 16 deletions(-) rename backend/dataall/{db/api/dataset_profiling_run.py => modules/datasets/services/dataset_profiling_service.py} (96%) diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index a61afc3a1..567ab6967 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -7,6 +7,7 @@ from ... 
import db from ...db import models from dataall.modules.datasets.db.models import DatasetProfilingRun +from dataall.modules.datasets.services.dataset_profiling_service import DatasetProfilingService log = logging.getLogger('aws:glue') @@ -528,7 +529,7 @@ def get_job_runs(engine, task: models.Task): def start_profiling_run(engine, task: models.Task): with engine.scoped_session() as session: profiling: DatasetProfilingRun = ( - db.api.DatasetProfilingRun.get_profiling_run( + DatasetProfilingService.get_profiling_run( session, profilingRunUri=task.targetUri ) ) @@ -547,7 +548,7 @@ def start_profiling_run(engine, task: models.Task): ), } ) - db.api.DatasetProfilingRun.update_run( + DatasetProfilingService.update_run( session, profilingRunUri=profiling.profilingRunUri, GlueJobRunId=run['JobRunId'], @@ -574,7 +575,7 @@ def run_job(**data): def get_profiling_run(engine, task: models.Task): with engine.scoped_session() as session: profiling: DatasetProfilingRun = ( - db.api.DatasetProfilingRun.get_profiling_run( + DatasetProfilingService.get_profiling_run( session, profilingRunUri=task.targetUri ) ) diff --git a/backend/dataall/db/api/__init__.py b/backend/dataall/db/api/__init__.py index 19138f7d7..a5f11d2c7 100644 --- a/backend/dataall/db/api/__init__.py +++ b/backend/dataall/db/api/__init__.py @@ -13,7 +13,6 @@ from .share_object import ShareObject, ShareObjectSM, ShareItemSM from .dataset import Dataset from .dataset_location import DatasetStorageLocation -from .dataset_profiling_run import DatasetProfilingRun from .notification import Notification from .redshift_cluster import RedshiftCluster from .vpc import Vpc diff --git a/backend/dataall/modules/datasets/api/profiling/resolvers.py b/backend/dataall/modules/datasets/api/profiling/resolvers.py index 05efbbab8..62ff64942 100644 --- a/backend/dataall/modules/datasets/api/profiling/resolvers.py +++ b/backend/dataall/modules/datasets/api/profiling/resolvers.py @@ -7,6 +7,7 @@ from dataall.db import api, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.services.dataset_profiling_service import DatasetProfilingService from dataall.modules.datasets.db.models import DatasetProfilingRun log = logging.getLogger(__name__) @@ -33,7 +34,7 @@ def start_profiling_run(context: Context, source, input: dict = None): ) dataset = api.Dataset.get_dataset_by_uri(session, input['datasetUri']) - run = api.DatasetProfilingRun.start_profiling( + run = DatasetProfilingService.start_profiling( session=session, datasetUri=dataset.datasetUri, tableUri=input.get('tableUri'), @@ -71,7 +72,7 @@ def get_profiling_results(context: Context, source: DatasetProfilingRun): def update_profiling_run_results(context: Context, source, profilingRunUri, results): with context.engine.scoped_session() as session: - run = api.DatasetProfilingRun.update_run( + run = DatasetProfilingService.update_run( session=session, profilingRunUri=profilingRunUri, results=results ) return run @@ -79,12 +80,12 @@ def update_profiling_run_results(context: Context, source, profilingRunUri, resu def list_profiling_runs(context: Context, source, datasetUri=None): with context.engine.scoped_session() as session: - return api.DatasetProfilingRun.list_profiling_runs(session, datasetUri) + return DatasetProfilingService.list_profiling_runs(session, datasetUri) def get_profiling_run(context: Context, source, profilingRunUri=None): with context.engine.scoped_session() as session: - return 
api.DatasetProfilingRun.get_profiling_run( + return DatasetProfilingService.get_profiling_run( session=session, profilingRunUri=profilingRunUri ) @@ -92,7 +93,7 @@ def get_profiling_run(context: Context, source, profilingRunUri=None): def get_last_table_profiling_run(context: Context, source, tableUri=None): with context.engine.scoped_session() as session: run: DatasetProfilingRun = ( - api.DatasetProfilingRun.get_table_last_profiling_run( + DatasetProfilingService.get_table_last_profiling_run( session=session, tableUri=tableUri ) ) @@ -113,7 +114,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None): if not run.results: run_with_results = ( - api.DatasetProfilingRun.get_table_last_profiling_run_with_results( + DatasetProfilingService.get_table_last_profiling_run_with_results( session=session, tableUri=tableUri ) ) @@ -144,6 +145,6 @@ def get_profiling_results_from_s3(environment, dataset, table, run): def list_table_profiling_runs(context: Context, source, tableUri=None): with context.engine.scoped_session() as session: - return api.DatasetProfilingRun.list_table_profiling_runs( + return DatasetProfilingService.list_table_profiling_runs( session=session, tableUri=tableUri, filter={} ) diff --git a/backend/dataall/db/api/dataset_profiling_run.py b/backend/dataall/modules/datasets/services/dataset_profiling_service.py similarity index 96% rename from backend/dataall/db/api/dataset_profiling_run.py rename to backend/dataall/modules/datasets/services/dataset_profiling_service.py index 5e7024843..915a52eda 100644 --- a/backend/dataall/db/api/dataset_profiling_run.py +++ b/backend/dataall/modules/datasets/services/dataset_profiling_service.py @@ -1,11 +1,11 @@ from sqlalchemy import and_ -from .. import paginate, models -from ..exceptions import ObjectNotFound +from dataall.db import paginate, models +from dataall.db.exceptions import ObjectNotFound from dataall.modules.datasets.db.models import DatasetProfilingRun as DatasetProfilingRunModel -class DatasetProfilingRun: +class DatasetProfilingService: def __init__(self): pass @@ -56,7 +56,7 @@ def update_run( GlueJobRunState=None, results=None, ): - run = DatasetProfilingRun.get_profiling_run( + run = DatasetProfilingService.get_profiling_run( session, profilingRunUri=profilingRunUri, GlueJobRunId=GlueJobRunId ) if GlueJobRunId: diff --git a/backend/dataall/modules/datasets/tasks/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py index 8674f903a..74f84d7c9 100644 --- a/backend/dataall/modules/datasets/tasks/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -12,6 +12,7 @@ from dataall.aws.handlers.sqs import SqsQueue from dataall.db import get_engine from dataall.db import models +from dataall.modules.datasets.services.dataset_profiling_service import DatasetProfilingService from dataall.tasks.subscriptions import poll_queues from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService @@ -143,7 +144,7 @@ def store_dataquality_results(session, message): message.get('region'), ) - run = db.api.DatasetProfilingRun.start_profiling( + run = DatasetProfilingService.start_profiling( session=session, datasetUri=table.datasetUri, GlueTableName=table.GlueTableName, From 8a907df924211b345b94c2a2fc3c1f166ecd5fbf Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 14 Apr 2023 15:59:51 +0200 Subject: [PATCH 36/67] Datasets refactoring Extracted glue_profiling_handler --- 
backend/dataall/aws/handlers/glue.py | 86 ---------------- .../handlers/glue_profiling_handler.py | 98 +++++++++++++++++++ 2 files changed, 98 insertions(+), 86 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/glue_profiling_handler.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 567ab6967..e76fd4e63 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,8 +6,6 @@ from .sts import SessionHelper from ... import db from ...db import models -from dataall.modules.datasets.db.models import DatasetProfilingRun -from dataall.modules.datasets.services.dataset_profiling_service import DatasetProfilingService log = logging.getLogger('aws:glue') @@ -524,90 +522,6 @@ def get_job_runs(engine, task: models.Task): return [] return response['JobRuns'] - @staticmethod - @Worker.handler('glue.job.start_profiling_run') - def start_profiling_run(engine, task: models.Task): - with engine.scoped_session() as session: - profiling: DatasetProfilingRun = ( - DatasetProfilingService.get_profiling_run( - session, profilingRunUri=task.targetUri - ) - ) - dataset: models.Dataset = session.query(models.Dataset).get( - profiling.datasetUri - ) - run = Glue.run_job( - **{ - 'accountid': dataset.AwsAccountId, - 'name': dataset.GlueProfilingJobName, - 'region': dataset.region, - 'arguments': ( - {'--table': profiling.GlueTableName} - if profiling.GlueTableName - else {} - ), - } - ) - DatasetProfilingService.update_run( - session, - profilingRunUri=profiling.profilingRunUri, - GlueJobRunId=run['JobRunId'], - ) - return run - - @staticmethod - def run_job(**data): - accountid = data['accountid'] - name = data['name'] - try: - session = SessionHelper.remote_session(accountid=accountid) - client = session.client('glue', region_name=data.get('region', 'eu-west-1')) - response = client.start_job_run( - JobName=name, Arguments=data.get('arguments', {}) - ) - return response - except ClientError as e: - log.error(f'Failed to start profiling job {name} due to: {e}') - raise e - - @staticmethod - @Worker.handler('glue.job.profiling_run_status') - def get_profiling_run(engine, task: models.Task): - with engine.scoped_session() as session: - profiling: DatasetProfilingRun = ( - DatasetProfilingService.get_profiling_run( - session, profilingRunUri=task.targetUri - ) - ) - dataset: models.Dataset = session.query(models.Dataset).get( - profiling.datasetUri - ) - glue_run = Glue.get_job_run( - **{ - 'accountid': dataset.AwsAccountId, - 'name': dataset.GlueProfilingJobName, - 'region': dataset.region, - 'run_id': profiling.GlueJobRunId, - } - ) - profiling.status = glue_run['JobRun']['JobRunState'] - session.commit() - return profiling.status - - @staticmethod - def get_job_run(**data): - accountid = data['accountid'] - name = data['name'] - run_id = data['run_id'] - try: - session = SessionHelper.remote_session(accountid=accountid) - client = session.client('glue', region_name=data.get('region', 'eu-west-1')) - response = client.get_job_run(JobName=name, RunId=run_id) - return response - except ClientError as e: - log.error(f'Failed to get job run {run_id} due to: {e}') - raise e - @staticmethod def grant_principals_all_table_permissions( table: models.DatasetTable, principals: [str], client=None diff --git a/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py b/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py new file mode 100644 index 000000000..d15607733 --- /dev/null +++ 
b/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py
@@ -0,0 +1,98 @@
+import logging
+from botocore.exceptions import ClientError
+
+from dataall.aws.handlers.service_handlers import Worker
+from dataall.aws.handlers.sts import SessionHelper
+from dataall.db import models
+from dataall.modules.datasets.db.models import DatasetProfilingRun
+from dataall.modules.datasets.services.dataset_profiling_service import DatasetProfilingService
+
+log = logging.getLogger(__name__)
+
+
+class DatasetProfilingGlueHandler:
+    """A handler for dataset profiling"""
+
+    @staticmethod
+    @Worker.handler('glue.job.profiling_run_status')
+    def get_profiling_run(engine, task: models.Task):
+        with engine.scoped_session() as session:
+            profiling: DatasetProfilingRun = (
+                DatasetProfilingService.get_profiling_run(
+                    session, profilingRunUri=task.targetUri
+                )
+            )
+            dataset: models.Dataset = session.query(models.Dataset).get(
+                profiling.datasetUri
+            )
+            glue_run = DatasetProfilingGlueHandler.get_job_run(
+                **{
+                    'accountid': dataset.AwsAccountId,
+                    'name': dataset.GlueProfilingJobName,
+                    'region': dataset.region,
+                    'run_id': profiling.GlueJobRunId,
+                }
+            )
+            profiling.status = glue_run['JobRun']['JobRunState']
+            session.commit()
+            return profiling.status
+
+    @staticmethod
+    @Worker.handler('glue.job.start_profiling_run')
+    def start_profiling_run(engine, task: models.Task):
+        with engine.scoped_session() as session:
+            profiling: DatasetProfilingRun = (
+                DatasetProfilingService.get_profiling_run(
+                    session, profilingRunUri=task.targetUri
+                )
+            )
+            dataset: models.Dataset = session.query(models.Dataset).get(
+                profiling.datasetUri
+            )
+            run = DatasetProfilingGlueHandler.run_job(
+                **{
+                    'accountid': dataset.AwsAccountId,
+                    'name': dataset.GlueProfilingJobName,
+                    'region': dataset.region,
+                    'arguments': (
+                        {'--table': profiling.GlueTableName}
+                        if profiling.GlueTableName
+                        else {}
+                    ),
+                }
+            )
+            DatasetProfilingService.update_run(
+                session,
+                profilingRunUri=profiling.profilingRunUri,
+                GlueJobRunId=run['JobRunId'],
+            )
+            return run
+
+    @staticmethod
+    def get_job_run(**data):
+        accountid = data['accountid']
+        name = data['name']
+        run_id = data['run_id']
+        try:
+            session = SessionHelper.remote_session(accountid=accountid)
+            client = session.client('glue', region_name=data.get('region', 'eu-west-1'))
+            response = client.get_job_run(JobName=name, RunId=run_id)
+            return response
+        except ClientError as e:
+            log.error(f'Failed to get job run {run_id} due to: {e}')
+            raise e
+
+    @staticmethod
+    def run_job(**data):
+        accountid = data['accountid']
+        name = data['name']
+        try:
+            session = SessionHelper.remote_session(accountid=accountid)
+            client = session.client('glue', region_name=data.get('region', 'eu-west-1'))
+            response = client.start_job_run(
+                JobName=name, Arguments=data.get('arguments', {})
+            )
+            return response
+        except ClientError as e:
+            log.error(f'Failed to start profiling job {name} due to: {e}')
+            raise e

From 561da72a94c1eefe4547eede1a03582b2ee14f79 Mon Sep 17 00:00:00 2001
From: Nikita Podshivalov
Date: Fri, 14 Apr 2023 16:08:24 +0200
Subject: [PATCH 37/67] Datasets refactoring

Deleted DatasetTableProfilingJob since could not find any usage of it
---
 .../db/models/DatasetTableProfilingJob.py | 18 ------------------
 backend/dataall/db/models/__init__.py     |  1 -
 2 files changed, 19 deletions(-)
 delete mode 100644 backend/dataall/db/models/DatasetTableProfilingJob.py

diff --git a/backend/dataall/db/models/DatasetTableProfilingJob.py b/backend/dataall/db/models/DatasetTableProfilingJob.py
deleted file mode 100644
index ea0fedbf0..000000000
--- a/backend/dataall/db/models/DatasetTableProfilingJob.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from sqlalchemy import Column, String
-from sqlalchemy.orm import query_expression
-
-from .. import Base
-from .. import Resource, utils
-
-
-class DatasetTableProfilingJob(Resource, Base):
-    __tablename__ = 'dataset_table_profiling_job'
-    tableUri = Column(String, nullable=False)
-    jobUri = Column(String, primary_key=True, default=utils.uuid('profilingjob'))
-    AWSAccountId = Column(String, nullable=False)
-    RunCommandId = Column(String, nullable=True)
-    GlueDatabaseName = Column(String, nullable=False)
-    GlueTableName = Column(String, nullable=False)
-    region = Column(String, default='eu-west-1')
-    status = Column(String, default='')
-    userRoleForJob = query_expression()
diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py
index f25e5f59b..0af480d79 100644
--- a/backend/dataall/db/models/__init__.py
+++ b/backend/dataall/db/models/__init__.py
@@ -8,7 +8,6 @@
 from .DatasetQualityRule import DatasetQualityRule
 from .DatasetStorageLocation import DatasetStorageLocation
 from .DatasetTable import DatasetTable
-from .DatasetTableProfilingJob import DatasetTableProfilingJob
 from .Environment import Environment
 from .EnvironmentGroup import EnvironmentGroup
 from .FeedMessage import FeedMessage

From 73c8150a363b3eafc6715d658f97dc6449db2dbd Mon Sep 17 00:00:00 2001
From: Nikita Podshivalov
Date: Mon, 17 Apr 2023 10:29:51 +0200
Subject: [PATCH 38/67] Datasets refactoring

Moved dataset storage location into modules
---
 backend/dataall/api/Objects/__init__.py             |  1 -
 .../api}/DatasetStorageLocation/__init__.py         |  2 +-
 .../api}/DatasetStorageLocation/input_types.py      |  2 +-
 .../api}/DatasetStorageLocation/mutations.py        | 13 +++++++++----
 .../datasets/api}/DatasetStorageLocation/queries.py |  4 ++--
 .../api}/DatasetStorageLocation/resolvers.py        | 12 ++++++------
 .../datasets/api}/DatasetStorageLocation/schema.py  |  7 +++++--
 backend/dataall/modules/datasets/api/__init__.py    |  5 +++--
 8 files changed, 27 insertions(+), 19 deletions(-)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/__init__.py (69%)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/input_types.py (97%)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/mutations.py (76%)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/queries.py (66%)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/resolvers.py (94%)
 rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetStorageLocation/schema.py (95%)

diff --git a/backend/dataall/api/Objects/__init__.py b/backend/dataall/api/Objects/__init__.py
index 80b91358a..7c064fb1f 100644
--- a/backend/dataall/api/Objects/__init__.py
+++ b/backend/dataall/api/Objects/__init__.py
@@ -24,7 +24,6 @@
     Dashboard,
     ShareObject,
     Organization,
-    DatasetStorageLocation,
     Stack,
     Test,
     SagemakerStudio,
diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/__init__.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py
similarity index 69%
rename from backend/dataall/api/Objects/DatasetStorageLocation/__init__.py
rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py
index dfa46b264..e00ffe36f 100644
--- a/backend/dataall/api/Objects/DatasetStorageLocation/__init__.py
+++ 
b/backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py @@ -1,4 +1,4 @@ -from . import ( +from dataall.modules.datasets.api.DatasetStorageLocation import ( input_types, mutations, queries, diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/input_types.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/input_types.py similarity index 97% rename from backend/dataall/api/Objects/DatasetStorageLocation/input_types.py rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/input_types.py index f948bebad..4e4bf10e4 100644 --- a/backend/dataall/api/Objects/DatasetStorageLocation/input_types.py +++ b/backend/dataall/modules/datasets/api/DatasetStorageLocation/input_types.py @@ -1,4 +1,4 @@ -from ... import gql +from dataall.api import gql NewDatasetStorageLocationInput = gql.InputType( name='NewDatasetStorageLocationInput', diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/mutations.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py similarity index 76% rename from backend/dataall/api/Objects/DatasetStorageLocation/mutations.py rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py index 5b89cc6c1..10fc2ec40 100644 --- a/backend/dataall/api/Objects/DatasetStorageLocation/mutations.py +++ b/backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py @@ -1,10 +1,15 @@ -from ... import gql -from .input_types import ( +from dataall.api import gql +from dataall.modules.datasets.api.DatasetStorageLocation.input_types import ( ModifyDatasetFolderInput, NewDatasetStorageLocationInput, ) -from .resolvers import * -from .schema import DatasetStorageLocation +from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import ( + create_storage_location, + update_storage_location, + remove_storage_location, + publish_location_update +) +from dataall.modules.datasets.api.DatasetStorageLocation.schema import DatasetStorageLocation createDatasetStorageLocation = gql.MutationField( name='createDatasetStorageLocation', diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/queries.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py similarity index 66% rename from backend/dataall/api/Objects/DatasetStorageLocation/queries.py rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py index 1baa5a7f9..447225cfd 100644 --- a/backend/dataall/api/Objects/DatasetStorageLocation/queries.py +++ b/backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py @@ -1,5 +1,5 @@ -from ... 
import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import get_storage_location getDatasetStorageLocation = gql.QueryField( name='getDatasetStorageLocation', diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/resolvers.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/resolvers.py similarity index 94% rename from backend/dataall/api/Objects/DatasetStorageLocation/resolvers.py rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/resolvers.py index 1a4171444..5f73c468c 100644 --- a/backend/dataall/api/Objects/DatasetStorageLocation/resolvers.py +++ b/backend/dataall/modules/datasets/api/DatasetStorageLocation/resolvers.py @@ -1,15 +1,15 @@ -from ....api.context import Context -from ....aws.handlers.service_handlers import Worker -from ....aws.handlers.s3 import S3 -from ....db import permissions, models -from ....db.api import ( +from dataall.api.context import Context +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.s3 import S3 +from dataall.db import permissions, models +from dataall.db.api import ( ResourcePolicy, Glossary, DatasetStorageLocation, Dataset, Environment, ) -from ....searchproxy import indexers +from dataall.searchproxy import indexers def create_storage_location( diff --git a/backend/dataall/api/Objects/DatasetStorageLocation/schema.py b/backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py similarity index 95% rename from backend/dataall/api/Objects/DatasetStorageLocation/schema.py rename to backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py index d05309f0b..fab6c3bd1 100644 --- a/backend/dataall/api/Objects/DatasetStorageLocation/schema.py +++ b/backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py @@ -1,5 +1,8 @@ -from ... 
import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import ( + resolve_glossary_terms, + resolve_dataset +) DatasetStorageLocation = gql.ObjectType( name='DatasetStorageLocation', diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 6bb6f8ab0..4657365df 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1,7 +1,8 @@ """The GraphQL schema of datasets and related functionality""" from dataall.modules.datasets.api import ( table_column, - profiling + profiling, + DatasetStorageLocation ) -__all__ = ["table_column", "profiling"] +__all__ = ["table_column", "profiling", "DatasetStorageLocation"] From 56a3610116ae136579ead39bf70ba962b7592f8a Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 10:31:21 +0200 Subject: [PATCH 39/67] Datasets refactoring Renamed dataset storage location --- backend/dataall/modules/datasets/api/__init__.py | 4 ++-- .../__init__.py | 2 +- .../input_types.py | 0 .../mutations.py | 6 +++--- .../{DatasetStorageLocation => storage_location}/queries.py | 2 +- .../resolvers.py | 0 .../{DatasetStorageLocation => storage_location}/schema.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/__init__.py (69%) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/input_types.py (100%) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/mutations.py (84%) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/queries.py (74%) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/resolvers.py (100%) rename backend/dataall/modules/datasets/api/{DatasetStorageLocation => storage_location}/schema.py (97%) diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 4657365df..4c279340e 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -2,7 +2,7 @@ from dataall.modules.datasets.api import ( table_column, profiling, - DatasetStorageLocation + storage_location ) -__all__ = ["table_column", "profiling", "DatasetStorageLocation"] +__all__ = ["table_column", "profiling", "storage_location"] diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py b/backend/dataall/modules/datasets/api/storage_location/__init__.py similarity index 69% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py rename to backend/dataall/modules/datasets/api/storage_location/__init__.py index e00ffe36f..e878410f5 100644 --- a/backend/dataall/modules/datasets/api/DatasetStorageLocation/__init__.py +++ b/backend/dataall/modules/datasets/api/storage_location/__init__.py @@ -1,4 +1,4 @@ -from dataall.modules.datasets.api.DatasetStorageLocation import ( +from dataall.modules.datasets.api.storage_location import ( input_types, mutations, queries, diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/input_types.py b/backend/dataall/modules/datasets/api/storage_location/input_types.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/input_types.py rename to 
backend/dataall/modules/datasets/api/storage_location/input_types.py diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py b/backend/dataall/modules/datasets/api/storage_location/mutations.py similarity index 84% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py rename to backend/dataall/modules/datasets/api/storage_location/mutations.py index 10fc2ec40..14aafddc7 100644 --- a/backend/dataall/modules/datasets/api/DatasetStorageLocation/mutations.py +++ b/backend/dataall/modules/datasets/api/storage_location/mutations.py @@ -1,15 +1,15 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetStorageLocation.input_types import ( +from dataall.modules.datasets.api.storage_location.input_types import ( ModifyDatasetFolderInput, NewDatasetStorageLocationInput, ) -from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import ( +from dataall.modules.datasets.api.storage_location.resolvers import ( create_storage_location, update_storage_location, remove_storage_location, publish_location_update ) -from dataall.modules.datasets.api.DatasetStorageLocation.schema import DatasetStorageLocation +from dataall.modules.datasets.api.storage_location.schema import DatasetStorageLocation createDatasetStorageLocation = gql.MutationField( name='createDatasetStorageLocation', diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py b/backend/dataall/modules/datasets/api/storage_location/queries.py similarity index 74% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py rename to backend/dataall/modules/datasets/api/storage_location/queries.py index 447225cfd..ea129a37d 100644 --- a/backend/dataall/modules/datasets/api/DatasetStorageLocation/queries.py +++ b/backend/dataall/modules/datasets/api/storage_location/queries.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import get_storage_location +from dataall.modules.datasets.api.storage_location.resolvers import get_storage_location getDatasetStorageLocation = gql.QueryField( name='getDatasetStorageLocation', diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/resolvers.py rename to backend/dataall/modules/datasets/api/storage_location/resolvers.py diff --git a/backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py b/backend/dataall/modules/datasets/api/storage_location/schema.py similarity index 97% rename from backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py rename to backend/dataall/modules/datasets/api/storage_location/schema.py index fab6c3bd1..c9853a22f 100644 --- a/backend/dataall/modules/datasets/api/DatasetStorageLocation/schema.py +++ b/backend/dataall/modules/datasets/api/storage_location/schema.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetStorageLocation.resolvers import ( +from dataall.modules.datasets.api.storage_location.resolvers import ( resolve_glossary_terms, resolve_dataset ) From 47a38ccbaea0b3368b3cd5003208067081ccd1a6 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 10:37:27 +0200 Subject: [PATCH 40/67] Datasets refactoring Returned the name to model after renaming the service --- .../services/dataset_profiling_service.py | 48 
+++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/backend/dataall/modules/datasets/services/dataset_profiling_service.py b/backend/dataall/modules/datasets/services/dataset_profiling_service.py index 915a52eda..5b6ca8d41 100644 --- a/backend/dataall/modules/datasets/services/dataset_profiling_service.py +++ b/backend/dataall/modules/datasets/services/dataset_profiling_service.py @@ -2,7 +2,7 @@ from dataall.db import paginate, models from dataall.db.exceptions import ObjectNotFound -from dataall.modules.datasets.db.models import DatasetProfilingRun as DatasetProfilingRunModel +from dataall.modules.datasets.db.models import DatasetProfilingRun class DatasetProfilingService: @@ -31,7 +31,7 @@ def start_profiling( if not environment: raise ObjectNotFound('Environment', dataset.environmentUri) - run = DatasetProfilingRunModel( + run = DatasetProfilingRun( datasetUri=dataset.datasetUri, status='RUNNING', AwsAccountId=environment.AwsAccountId, @@ -73,14 +73,14 @@ def get_profiling_run( session, profilingRunUri=None, GlueJobRunId=None, GlueTableName=None ): if profilingRunUri: - run: DatasetProfilingRunModel = session.query( - DatasetProfilingRunModel + run: DatasetProfilingRun = session.query( + DatasetProfilingRun ).get(profilingRunUri) else: - run: DatasetProfilingRunModel = ( - session.query(DatasetProfilingRunModel) - .filter(DatasetProfilingRunModel.GlueJobRunId == GlueJobRunId) - .filter(DatasetProfilingRunModel.GlueTableName == GlueTableName) + run: DatasetProfilingRun = ( + session.query(DatasetProfilingRun) + .filter(DatasetProfilingRun.GlueJobRunId == GlueJobRunId) + .filter(DatasetProfilingRun.GlueTableName == GlueTableName) .first() ) return run @@ -90,9 +90,9 @@ def list_profiling_runs(session, datasetUri, filter: dict = None): if not filter: filter = {} q = ( - session.query(DatasetProfilingRunModel) - .filter(DatasetProfilingRunModel.datasetUri == datasetUri) - .order_by(DatasetProfilingRunModel.created.desc()) + session.query(DatasetProfilingRun) + .filter(DatasetProfilingRun.datasetUri == datasetUri) + .order_by(DatasetProfilingRun.created.desc()) ) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 20) @@ -103,19 +103,19 @@ def list_table_profiling_runs(session, tableUri, filter): if not filter: filter = {} q = ( - session.query(DatasetProfilingRunModel) + session.query(DatasetProfilingRun) .join( models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) .filter( and_( models.DatasetTable.tableUri == tableUri, models.DatasetTable.GlueTableName - == DatasetProfilingRunModel.GlueTableName, + == DatasetProfilingRun.GlueTableName, ) ) - .order_by(DatasetProfilingRunModel.created.desc()) + .order_by(DatasetProfilingRun.created.desc()) ) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 20) @@ -124,34 +124,34 @@ def list_table_profiling_runs(session, tableUri, filter): @staticmethod def get_table_last_profiling_run(session, tableUri): return ( - session.query(DatasetProfilingRunModel) + session.query(DatasetProfilingRun) .join( models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) .filter(models.DatasetTable.tableUri == tableUri) .filter( models.DatasetTable.GlueTableName - == DatasetProfilingRunModel.GlueTableName + == DatasetProfilingRun.GlueTableName ) - 
.order_by(DatasetProfilingRunModel.created.desc()) + .order_by(DatasetProfilingRun.created.desc()) .first() ) @staticmethod def get_table_last_profiling_run_with_results(session, tableUri): return ( - session.query(DatasetProfilingRunModel) + session.query(DatasetProfilingRun) .join( models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRunModel.datasetUri, + models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) .filter(models.DatasetTable.tableUri == tableUri) .filter( models.DatasetTable.GlueTableName - == DatasetProfilingRunModel.GlueTableName + == DatasetProfilingRun.GlueTableName ) - .filter(DatasetProfilingRunModel.results.isnot(None)) - .order_by(DatasetProfilingRunModel.created.desc()) + .filter(DatasetProfilingRun.results.isnot(None)) + .order_by(DatasetProfilingRun.created.desc()) .first() ) From dbb55179baefc20027578bf89747a82ce192ee13 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 10:54:08 +0200 Subject: [PATCH 41/67] Datasets refactoring Moved DatasetStorageLocation into modules --- backend/dataall/api/Objects/Feed/registry.py | 1 - .../dataall/api/Objects/Glossary/registry.py | 1 - .../dataall/api/Objects/Glossary/resolvers.py | 4 +- .../api/Objects/ShareObject/resolvers.py | 3 +- backend/dataall/cdkproxy/stacks/dataset.py | 15 ++++---- backend/dataall/db/api/dataset.py | 21 ++++++----- backend/dataall/db/api/dataset_location.py | 37 ++++++++++--------- backend/dataall/db/api/environment.py | 20 +++++----- backend/dataall/db/api/share_object.py | 23 ++++++------ .../db/models/DatasetStorageLocation.py | 22 ----------- backend/dataall/db/models/__init__.py | 1 - backend/dataall/modules/datasets/__init__.py | 7 +++- .../api/storage_location/resolvers.py | 5 ++- backend/dataall/modules/datasets/db/models.py | 22 ++++++++++- .../datasets/tasks/subscription_service.py | 3 +- backend/dataall/searchproxy/indexers.py | 31 ++++++++-------- .../dataall/tasks/bucket_policy_updater.py | 15 ++++---- .../share_managers/s3_share_manager.py | 3 +- .../share_processors/s3_process_share.py | 7 ++-- backend/dataall/utils/alarm_service.py | 5 ++- tests/api/conftest.py | 5 ++- tests/api/test_dataset.py | 3 +- tests/searchproxy/test_indexers.py | 3 +- tests/tasks/conftest.py | 9 ++--- tests/tasks/test_s3_share_manager.py | 37 ++++++++++--------- 25 files changed, 161 insertions(+), 142 deletions(-) delete mode 100644 backend/dataall/db/models/DatasetStorageLocation.py diff --git a/backend/dataall/api/Objects/Feed/registry.py b/backend/dataall/api/Objects/Feed/registry.py index a119529ab..6a01a488a 100644 --- a/backend/dataall/api/Objects/Feed/registry.py +++ b/backend/dataall/api/Objects/Feed/registry.py @@ -39,5 +39,4 @@ def types(cls): FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) -FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 375f470e2..7c42e4f4c 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -52,7 +52,6 @@ def types(cls): GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) 
-GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) GlossaryRegistry.register(GlossaryDefinition("Dashboard", "Dashboard", models.Dashboard)) GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Dataset", "Dataset", models.Dataset)) diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 15e77327f..959578600 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -13,6 +13,8 @@ GlossaryRole ) +from dataall.modules.datasets.db.models import DatasetStorageLocation + def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -465,7 +467,7 @@ def reindex(context, linkUri): upsert_dataset(session=session, es=context.es, datasetUri=link.targetUri) elif isinstance(target, models.DatasetTable): upsert_table(session=session, es=context.es, tableUri=link.targetUri) - elif isinstance(target, models.DatasetStorageLocation): + elif isinstance(target, DatasetStorageLocation): upsert_folder(session=session, es=context.es, locationUri=link.targetUri) elif isinstance(target, models.Dashboard): upsert_dashboard(session=session, es=context.es, dashboardUri=link.targetUri) diff --git a/backend/dataall/api/Objects/ShareObject/resolvers.py b/backend/dataall/api/Objects/ShareObject/resolvers.py index 6bbb64bf4..16e4e1353 100644 --- a/backend/dataall/api/Objects/ShareObject/resolvers.py +++ b/backend/dataall/api/Objects/ShareObject/resolvers.py @@ -7,6 +7,7 @@ from ....api.context import Context from ....aws.handlers.service_handlers import Worker from ....db import models +from dataall.modules.datasets.db.models import DatasetStorageLocation log = logging.getLogger(__name__) @@ -266,7 +267,7 @@ def resolve_dataset(context: Context, source: models.ShareObject, **kwargs): def union_resolver(object, *_): if isinstance(object, models.DatasetTable): return 'DatasetTable' - elif isinstance(object, models.DatasetStorageLocation): + elif isinstance(object, DatasetStorageLocation): return 'DatasetStorageLocation' diff --git a/backend/dataall/cdkproxy/stacks/dataset.py b/backend/dataall/cdkproxy/stacks/dataset.py index 133b5f928..852cba66b 100644 --- a/backend/dataall/cdkproxy/stacks/dataset.py +++ b/backend/dataall/cdkproxy/stacks/dataset.py @@ -28,6 +28,7 @@ from ...db.api import Environment from ...utils.cdk_nag_utils import CDKNagUtil from ...utils.runtime_stacks_tagging import TagsUtil +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -110,14 +111,14 @@ def get_shared_tables(self) -> typing.List[models.ShareObjectItem]: logger.info(f'found {len(tables)} shared tables') return tables - def get_shared_folders(self) -> typing.List[models.DatasetStorageLocation]: + def get_shared_folders(self) -> typing.List[DatasetStorageLocation]: engine = self.get_engine() with engine.scoped_session() as session: locations = ( session.query( - models.DatasetStorageLocation.locationUri.label('locationUri'), - models.DatasetStorageLocation.S3BucketName.label('S3BucketName'), - models.DatasetStorageLocation.S3Prefix.label('S3Prefix'), + DatasetStorageLocation.locationUri.label('locationUri'), + DatasetStorageLocation.S3BucketName.label('S3BucketName'), + DatasetStorageLocation.S3Prefix.label('S3Prefix'), models.Environment.AwsAccountId.label('AwsAccountId'), 
models.Environment.region.label('region'), ) @@ -125,7 +126,7 @@ def get_shared_folders(self) -> typing.List[models.DatasetStorageLocation]: models.ShareObjectItem, and_( models.ShareObjectItem.itemUri - == models.DatasetStorageLocation.locationUri + == DatasetStorageLocation.locationUri ), ) .join( @@ -139,8 +140,8 @@ def get_shared_folders(self) -> typing.List[models.DatasetStorageLocation]: ) .filter( and_( - models.DatasetStorageLocation.datasetUri == self.target_uri, - models.DatasetStorageLocation.deleted.is_(None), + DatasetStorageLocation.datasetUri == self.target_uri, + DatasetStorageLocation.deleted.is_(None), models.ShareObjectItem.status.in_(self.shared_states) ) ) diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py index 8fdbb72b7..f78d92eae 100644 --- a/backend/dataall/db/api/dataset.py +++ b/backend/dataall/db/api/dataset.py @@ -20,6 +20,7 @@ NamingConventionService, NamingConventionPattern, ) +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -266,17 +267,17 @@ def paginated_user_datasets( def paginated_dataset_locations( session, username, groups, uri, data=None, check_perm=None ) -> dict: - query = session.query(models.DatasetStorageLocation).filter( - models.DatasetStorageLocation.datasetUri == uri + query = session.query(DatasetStorageLocation).filter( + DatasetStorageLocation.datasetUri == uri ) if data and data.get('term'): query = query.filter( or_( *[ - models.DatasetStorageLocation.name.ilike( + DatasetStorageLocation.name.ilike( '%' + data.get('term') + '%' ), - models.DatasetStorageLocation.S3Prefix.ilike( + DatasetStorageLocation.S3Prefix.ilike( '%' + data.get('term') + '%' ), ] @@ -489,8 +490,8 @@ def get_dataset_tables(session, dataset_uri): def get_dataset_folders(session, dataset_uri): """return the dataset folders""" return ( - session.query(models.DatasetStorageLocation) - .filter(models.DatasetStorageLocation.datasetUri == dataset_uri) + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) .all() ) @@ -634,10 +635,10 @@ def _delete_dataset_tables(session, dataset_uri) -> bool: @staticmethod def _delete_dataset_locations(session, dataset_uri) -> bool: locations = ( - session.query(models.DatasetStorageLocation) + session.query(DatasetStorageLocation) .filter( and_( - models.DatasetStorageLocation.datasetUri == dataset_uri, + DatasetStorageLocation.datasetUri == dataset_uri, ) ) .all() @@ -675,7 +676,7 @@ def count_dataset_tables(session, dataset_uri): @staticmethod def count_dataset_locations(session, dataset_uri): return ( - session.query(models.DatasetStorageLocation) - .filter(models.DatasetStorageLocation.datasetUri == dataset_uri) + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) .count() ) diff --git a/backend/dataall/db/api/dataset_location.py b/backend/dataall/db/api/dataset_location.py index ef9f085f3..e19f1dfb0 100644 --- a/backend/dataall/db/api/dataset_location.py +++ b/backend/dataall/db/api/dataset_location.py @@ -6,6 +6,7 @@ from . import has_tenant_perm, has_resource_perm, Glossary from .. 
import models, api, paginate, permissions, exceptions from .dataset import Dataset +from dataall.modules.datasets.db.models import DatasetStorageLocation as DatasetStorageLocationModel logger = logging.getLogger(__name__) @@ -21,14 +22,14 @@ def create_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> models.DatasetStorageLocation: + ) -> DatasetStorageLocationModel: dataset = Dataset.get_dataset_by_uri(session, uri) exists = ( - session.query(models.DatasetStorageLocation) + session.query(DatasetStorageLocationModel) .filter( and_( - models.DatasetStorageLocation.datasetUri == dataset.datasetUri, - models.DatasetStorageLocation.S3Prefix == data['prefix'], + DatasetStorageLocationModel.datasetUri == dataset.datasetUri, + DatasetStorageLocationModel.S3Prefix == data['prefix'], ) ) .count() @@ -40,7 +41,7 @@ def create_dataset_location( message=f'Folder: {data["prefix"]} already exist on dataset {uri}', ) - location = models.DatasetStorageLocation( + location = DatasetStorageLocationModel( datasetUri=dataset.datasetUri, label=data.get('label'), description=data.get('description', 'No description provided'), @@ -77,14 +78,14 @@ def list_dataset_locations( check_perm: bool = False, ) -> dict: query = ( - session.query(models.DatasetStorageLocation) - .filter(models.DatasetStorageLocation.datasetUri == uri) - .order_by(models.DatasetStorageLocation.created.desc()) + session.query(DatasetStorageLocationModel) + .filter(DatasetStorageLocationModel.datasetUri == uri) + .order_by(DatasetStorageLocationModel.created.desc()) ) if data.get('term'): term = data.get('term') query = query.filter( - models.DatasetStorageLocation.label.ilike('%' + term + '%') + DatasetStorageLocationModel.label.ilike('%' + term + '%') ) return paginate( query, page=data.get('page', 1), page_size=data.get('pageSize', 10) @@ -100,7 +101,7 @@ def get_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> models.DatasetStorageLocation: + ) -> DatasetStorageLocationModel: return DatasetStorageLocation.get_location_by_uri(session, data['locationUri']) @staticmethod @@ -113,7 +114,7 @@ def update_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> models.DatasetStorageLocation: + ) -> DatasetStorageLocationModel: location = data.get( 'location', @@ -176,9 +177,9 @@ def delete_dataset_location( return True @staticmethod - def get_location_by_uri(session, location_uri) -> models.DatasetStorageLocation: + def get_location_by_uri(session, location_uri) -> DatasetStorageLocationModel: location: DatasetStorageLocation = session.query( - models.DatasetStorageLocation + DatasetStorageLocationModel ).get(location_uri) if not location: raise exceptions.ObjectNotFound('Folder', location_uri) @@ -186,13 +187,13 @@ def get_location_by_uri(session, location_uri) -> models.DatasetStorageLocation: @staticmethod def get_location_by_s3_prefix(session, s3_prefix, accountid, region): - location: models.DatasetStorageLocation = ( - session.query(models.DatasetStorageLocation) + location: DatasetStorageLocationModel = ( + session.query(DatasetStorageLocationModel) .filter( and_( - models.DatasetStorageLocation.S3Prefix.startswith(s3_prefix), - models.DatasetStorageLocation.AWSAccountId == accountid, - models.DatasetStorageLocation.region == region, + DatasetStorageLocationModel.S3Prefix.startswith(s3_prefix), + DatasetStorageLocationModel.AWSAccountId == accountid, + DatasetStorageLocationModel.region == region, ) ) .first() diff --git 
a/backend/dataall/db/api/environment.py b/backend/dataall/db/api/environment.py index 19d5de342..ac3777e5d 100644 --- a/backend/dataall/db/api/environment.py +++ b/backend/dataall/db/api/environment.py @@ -29,6 +29,8 @@ NamingConventionPattern, ) +from dataall.modules.datasets.db.models import DatasetStorageLocation + log = logging.getLogger(__name__) @@ -905,7 +907,7 @@ def paginated_shared_with_environment_datasets( ( models.ShareObjectItem.itemType == ShareableType.StorageLocation.value, - func.concat(models.DatasetStorageLocation.name), + func.concat(DatasetStorageLocation.name), ), ], else_='XXX XXXX', @@ -933,9 +935,9 @@ def paginated_shared_with_environment_datasets( models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, ) .outerjoin( - models.DatasetStorageLocation, + DatasetStorageLocation, models.ShareObjectItem.itemUri - == models.DatasetStorageLocation.locationUri, + == DatasetStorageLocation.locationUri, ) .filter( and_( @@ -1001,7 +1003,7 @@ def paginated_shared_with_environment_group_datasets( ( models.ShareObjectItem.itemType == ShareableType.StorageLocation.value, - func.concat(models.DatasetStorageLocation.name), + func.concat(DatasetStorageLocation.name), ), ], else_='XXX XXXX', @@ -1029,9 +1031,9 @@ def paginated_shared_with_environment_group_datasets( models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, ) .outerjoin( - models.DatasetStorageLocation, + DatasetStorageLocation, models.ShareObjectItem.itemUri - == models.DatasetStorageLocation.locationUri, + == DatasetStorageLocation.locationUri, ) .filter( and_( @@ -1122,7 +1124,7 @@ def paginated_environment_data_items( ( models.ShareObjectItem.itemType == ShareableType.StorageLocation.value, - func.concat(models.DatasetStorageLocation.name), + func.concat(DatasetStorageLocation.name), ), ], else_='XXX XXXX', @@ -1150,9 +1152,9 @@ def paginated_environment_data_items( models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, ) .outerjoin( - models.DatasetStorageLocation, + DatasetStorageLocation, models.ShareObjectItem.itemUri - == models.DatasetStorageLocation.locationUri, + == DatasetStorageLocation.locationUri, ) .filter( and_( diff --git a/backend/dataall/db/api/share_object.py b/backend/dataall/db/api/share_object.py index ff1c426d7..bd0215190 100644 --- a/backend/dataall/db/api/share_object.py +++ b/backend/dataall/db/api/share_object.py @@ -10,6 +10,7 @@ from .. import api, utils from .. 
import models, exceptions, permissions, paginate from ..models.Enums import ShareObjectStatus, ShareItemStatus, ShareObjectActions, ShareItemActions, ShareableType, PrincipalType +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -419,7 +420,7 @@ def create_share_object( item = None if itemType: if itemType == ShareableType.StorageLocation.value: - item = session.query(models.DatasetStorageLocation).get(itemUri) + item = session.query(DatasetStorageLocation).get(itemUri) if itemType == ShareableType.Table.value: item = session.query(models.DatasetTable).get(itemUri) @@ -718,7 +719,7 @@ def get_share_item( if share_item.itemType == ShareableType.Table.value: return session.query(models.DatasetTable).get(share_item.itemUri) if share_item.itemType == ShareableType.StorageLocation: - return session.Query(models.DatasetStorageLocation).get(share_item.itemUri) + return session.Query(DatasetStorageLocation).get(share_item.itemUri) @staticmethod def get_share_by_uri(session, uri): @@ -771,7 +772,7 @@ def add_share_object_item( ) elif itemType == ShareableType.StorageLocation.value: - item = session.query(models.DatasetStorageLocation).get(itemUri) + item = session.query(DatasetStorageLocation).get(itemUri) if not item: raise exceptions.ObjectNotFound('ShareObjectItem', itemUri) @@ -971,10 +972,10 @@ def list_shareable_items( # marking the folder as part of the shareObject locations = ( session.query( - models.DatasetStorageLocation.locationUri.label('itemUri'), + DatasetStorageLocation.locationUri.label('itemUri'), func.coalesce('DatasetStorageLocation').label('itemType'), - models.DatasetStorageLocation.S3Prefix.label('itemName'), - models.DatasetStorageLocation.description.label('description'), + DatasetStorageLocation.S3Prefix.label('itemName'), + DatasetStorageLocation.description.label('description'), models.ShareObjectItem.shareItemUri.label('shareItemUri'), models.ShareObjectItem.status.label('status'), case( @@ -986,11 +987,11 @@ def list_shareable_items( models.ShareObjectItem, and_( models.ShareObjectItem.shareUri == share.shareUri, - models.DatasetStorageLocation.locationUri + DatasetStorageLocation.locationUri == models.ShareObjectItem.itemUri, ), ) - .filter(models.DatasetStorageLocation.datasetUri == datasetUri) + .filter(DatasetStorageLocation.datasetUri == datasetUri) ) if data: if data.get("isRevokable"): @@ -1162,7 +1163,7 @@ def find_share_item_by_table( def find_share_item_by_folder( session, share: models.ShareObject, - folder: models.DatasetStorageLocation, + folder: DatasetStorageLocation, ) -> models.ShareObjectItem: share_item: models.ShareObjectItem = ( session.query(models.ShareObjectItem) @@ -1268,10 +1269,10 @@ def get_share_data_items(session, share_uri, status): ) folders = ( - session.query(models.DatasetStorageLocation) + session.query(DatasetStorageLocation) .join( models.ShareObjectItem, - models.ShareObjectItem.itemUri == models.DatasetStorageLocation.locationUri, + models.ShareObjectItem.itemUri == DatasetStorageLocation.locationUri, ) .join( models.ShareObject, diff --git a/backend/dataall/db/models/DatasetStorageLocation.py b/backend/dataall/db/models/DatasetStorageLocation.py deleted file mode 100644 index e21ae6694..000000000 --- a/backend/dataall/db/models/DatasetStorageLocation.py +++ /dev/null @@ -1,22 +0,0 @@ -from sqlalchemy import Boolean, Column, String -from sqlalchemy.orm import query_expression - -from .. 
import Base, Resource, utils - - -class DatasetStorageLocation(Resource, Base): - __tablename__ = 'dataset_storage_location' - datasetUri = Column(String, nullable=False) - locationUri = Column(String, primary_key=True, default=utils.uuid('location')) - AWSAccountId = Column(String, nullable=False) - S3BucketName = Column(String, nullable=False) - S3Prefix = Column(String, nullable=False) - S3AccessPoint = Column(String, nullable=True) - region = Column(String, default='eu-west-1') - locationCreated = Column(Boolean, default=False) - userRoleForStorageLocation = query_expression() - projectPermission = query_expression() - environmentEndPoint = query_expression() - - def uri(self): - return self.locationUri diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py index 0af480d79..c288527cf 100644 --- a/backend/dataall/db/models/__init__.py +++ b/backend/dataall/db/models/__init__.py @@ -6,7 +6,6 @@ from .DashboardShare import DashboardShareStatus from .Dataset import Dataset from .DatasetQualityRule import DatasetQualityRule -from .DatasetStorageLocation import DatasetStorageLocation from .DatasetTable import DatasetTable from .Environment import Environment from .EnvironmentGroup import EnvironmentGroup diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 4620495fe..842eba82b 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -4,7 +4,7 @@ from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) @@ -19,8 +19,13 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api + FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) + FeedRegistry.register(FeedDefinition("DatasetStorageLocation", DatasetStorageLocation)) + GlossaryRegistry.register(GlossaryDefinition("Column", "DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", DatasetStorageLocation)) + log.info("API of datasets has been imported") diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 5f73c468c..2eb18198c 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -10,6 +10,7 @@ Environment, ) from dataall.searchproxy import indexers +from dataall.modules.datasets.db.models import DatasetStorageLocation def create_storage_location( @@ -92,7 +93,7 @@ def remove_storage_location(context, source, locationUri: str = None): return True -def resolve_dataset(context, source: models.DatasetStorageLocation, **kwargs): +def resolve_dataset(context, source: DatasetStorageLocation, **kwargs): if not source: return None with context.engine.scoped_session() as session: @@ -129,7 +130,7 @@ def publish_location_update(context: Context, source, locationUri: str = None): def resolve_glossary_terms( - context: Context, source: models.DatasetStorageLocation, **kwargs + context: Context, source: DatasetStorageLocation, **kwargs 
): if not source: return None diff --git a/backend/dataall/modules/datasets/db/models.py b/backend/dataall/modules/datasets/db/models.py index 1ba60bea1..2dfee26ec 100644 --- a/backend/dataall/modules/datasets/db/models.py +++ b/backend/dataall/modules/datasets/db/models.py @@ -1,5 +1,6 @@ -from sqlalchemy import Column, String +from sqlalchemy import Boolean, Column, String from sqlalchemy.dialects.postgresql import JSON +from sqlalchemy.orm import query_expression from dataall.db import Base, Resource, utils @@ -36,3 +37,22 @@ class DatasetProfilingRun(Resource, Base): AwsAccountId = Column(String) results = Column(JSON, default={}) status = Column(String, default='Created') + + +class DatasetStorageLocation(Resource, Base): + __tablename__ = 'dataset_storage_location' + datasetUri = Column(String, nullable=False) + locationUri = Column(String, primary_key=True, default=utils.uuid('location')) + AWSAccountId = Column(String, nullable=False) + S3BucketName = Column(String, nullable=False) + S3Prefix = Column(String, nullable=False) + S3AccessPoint = Column(String, nullable=True) + region = Column(String, default='eu-west-1') + locationCreated = Column(Boolean, default=False) + userRoleForStorageLocation = query_expression() + projectPermission = query_expression() + environmentEndPoint = query_expression() + + def uri(self): + return self.locationUri + diff --git a/backend/dataall/modules/datasets/tasks/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py index 74f84d7c9..7df028914 100644 --- a/backend/dataall/modules/datasets/tasks/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -16,6 +16,7 @@ from dataall.tasks.subscriptions import poll_queues from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.models import DatasetStorageLocation root = logging.getLogger() root.setLevel(logging.INFO) @@ -103,7 +104,7 @@ def publish_table_update_message(engine, message): @staticmethod def publish_location_update_message(session, message): - location: models.DatasetStorageLocation = ( + location: DatasetStorageLocation = ( db.api.DatasetStorageLocation.get_location_by_s3_prefix( session, message.get('prefix'), diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 78886716d..7361c2150 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -6,6 +6,7 @@ from .upsert import upsert from .. 
import db from ..db import models +from dataall.modules.datasets.db.models import DatasetStorageLocation log = logging.getLogger(__name__) @@ -184,14 +185,14 @@ def upsert_table(session, es, tableUri: str): def upsert_folder(session, es, locationUri: str): folder = ( session.query( - models.DatasetStorageLocation.datasetUri.label('datasetUri'), - models.DatasetStorageLocation.locationUri.label('uri'), - models.DatasetStorageLocation.name.label('name'), - models.DatasetStorageLocation.owner.label('owner'), - models.DatasetStorageLocation.label.label('label'), - models.DatasetStorageLocation.description.label('description'), - models.DatasetStorageLocation.tags.label('tags'), - models.DatasetStorageLocation.region.label('region'), + DatasetStorageLocation.datasetUri.label('datasetUri'), + DatasetStorageLocation.locationUri.label('uri'), + DatasetStorageLocation.name.label('name'), + DatasetStorageLocation.owner.label('owner'), + DatasetStorageLocation.label.label('label'), + DatasetStorageLocation.description.label('description'), + DatasetStorageLocation.tags.label('tags'), + DatasetStorageLocation.region.label('region'), models.Organization.organizationUri.label('orgUri'), models.Organization.name.label('orgName'), models.Environment.environmentUri.label('envUri'), @@ -200,13 +201,13 @@ def upsert_folder(session, es, locationUri: str): models.Dataset.S3BucketName.label('source'), models.Dataset.topics.label('topics'), models.Dataset.confidentiality.label('classification'), - models.DatasetStorageLocation.created, - models.DatasetStorageLocation.updated, - models.DatasetStorageLocation.deleted, + DatasetStorageLocation.created, + DatasetStorageLocation.updated, + DatasetStorageLocation.deleted, ) .join( models.Dataset, - models.Dataset.datasetUri == models.DatasetStorageLocation.datasetUri, + models.Dataset.datasetUri == DatasetStorageLocation.datasetUri, ) .join( models.Organization, @@ -216,7 +217,7 @@ def upsert_folder(session, es, locationUri: str): models.Environment, models.Dataset.environmentUri == models.Environment.environmentUri, ) - .filter(models.DatasetStorageLocation.locationUri == locationUri) + .filter(DatasetStorageLocation.locationUri == locationUri) .first() ) if folder: @@ -349,8 +350,8 @@ def remove_deleted_tables(session, es, datasetUri: str): def upsert_dataset_folders(session, es, datasetUri: str): folders = ( - session.query(models.DatasetStorageLocation) - .filter(models.DatasetStorageLocation.datasetUri == datasetUri) + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == datasetUri) .all() ) for folder in folders: diff --git a/backend/dataall/tasks/bucket_policy_updater.py b/backend/dataall/tasks/bucket_policy_updater.py index 5b8f322be..9932f53ae 100644 --- a/backend/dataall/tasks/bucket_policy_updater.py +++ b/backend/dataall/tasks/bucket_policy_updater.py @@ -10,6 +10,7 @@ from ..aws.handlers.sts import SessionHelper from ..db import get_engine from ..db import models, api +from dataall.modules.datasets.db.models import DatasetStorageLocation root = logging.getLogger() root.setLevel(logging.INFO) @@ -201,13 +202,13 @@ def get_shared_tables(self, dataset) -> typing.List[models.ShareObjectItem]: ).all() return tables - def get_shared_folders(self, dataset) -> typing.List[models.DatasetStorageLocation]: + def get_shared_folders(self, dataset) -> typing.List[DatasetStorageLocation]: with self.engine.scoped_session() as session: locations = ( session.query( - models.DatasetStorageLocation.locationUri.label('locationUri'), - 
models.DatasetStorageLocation.S3BucketName.label('S3BucketName'), - models.DatasetStorageLocation.S3Prefix.label('S3Prefix'), + DatasetStorageLocation.locationUri.label('locationUri'), + DatasetStorageLocation.S3BucketName.label('S3BucketName'), + DatasetStorageLocation.S3Prefix.label('S3Prefix'), models.Environment.AwsAccountId.label('AwsAccountId'), models.Environment.region.label('region'), ) @@ -215,7 +216,7 @@ def get_shared_folders(self, dataset) -> typing.List[models.DatasetStorageLocati models.ShareObjectItem, and_( models.ShareObjectItem.itemUri - == models.DatasetStorageLocation.locationUri + == DatasetStorageLocation.locationUri ), ) .join( @@ -229,8 +230,8 @@ def get_shared_folders(self, dataset) -> typing.List[models.DatasetStorageLocati ) .filter( and_( - models.DatasetStorageLocation.datasetUri == dataset.datasetUri, - models.DatasetStorageLocation.deleted.is_(None), + DatasetStorageLocation.datasetUri == dataset.datasetUri, + DatasetStorageLocation.deleted.is_(None), models.ShareObjectItem.status == models.Enums.ShareObjectStatus.Approved.value, ) diff --git a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py index 1323770a4..f0ea4e162 100644 --- a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py +++ b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py @@ -10,6 +10,7 @@ from ....aws.handlers.iam import IAM from ....utils.alarm_service import AlarmService +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) ACCESS_POINT_CREATION_TIME = 30 @@ -22,7 +23,7 @@ def __init__( session, dataset: models.Dataset, share: models.ShareObject, - target_folder: models.DatasetStorageLocation, + target_folder: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, source_env_group: models.EnvironmentGroup, diff --git a/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py b/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py index 6940d3392..96b608338 100644 --- a/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py +++ b/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py @@ -2,6 +2,7 @@ from ....db import models, api from ..share_managers import S3ShareManager +from dataall.modules.datasets.db.models import DatasetStorageLocation log = logging.getLogger(__name__) @@ -13,7 +14,7 @@ def __init__( session, dataset: models.Dataset, share: models.ShareObject, - share_folder: models.DatasetStorageLocation, + share_folder: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, source_env_group: models.EnvironmentGroup, @@ -37,7 +38,7 @@ def process_approved_shares( session, dataset: models.Dataset, share: models.ShareObject, - share_folders: [models.DatasetStorageLocation], + share_folders: [DatasetStorageLocation], source_environment: models.Environment, target_environment: models.Environment, source_env_group: models.EnvironmentGroup, @@ -104,7 +105,7 @@ def process_revoked_shares( session, dataset: models.Dataset, share: models.ShareObject, - revoke_folders: [models.DatasetStorageLocation], + revoke_folders: [DatasetStorageLocation], source_environment: models.Environment, target_environment: models.Environment, source_env_group: models.EnvironmentGroup, diff --git a/backend/dataall/utils/alarm_service.py 
b/backend/dataall/utils/alarm_service.py index 838029d3e..436d5a701 100644 --- a/backend/dataall/utils/alarm_service.py +++ b/backend/dataall/utils/alarm_service.py @@ -11,6 +11,7 @@ from ..aws.handlers.sts import SessionHelper from ..db import models +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -74,7 +75,7 @@ def trigger_table_sharing_failure_alarm( def trigger_folder_sharing_failure_alarm( self, - folder: models.DatasetStorageLocation, + folder: DatasetStorageLocation, share: models.ShareObject, target_environment: models.Environment, ): @@ -101,7 +102,7 @@ def trigger_folder_sharing_failure_alarm( def trigger_revoke_folder_sharing_failure_alarm( self, - folder: models.DatasetStorageLocation, + folder: DatasetStorageLocation, share: models.ShareObject, target_environment: models.Environment, ): diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 65dc6934b..8334f7700 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,6 +1,7 @@ from .client import * from dataall.db import models from dataall.api import constants +from dataall.modules.datasets.db.models import DatasetStorageLocation @pytest.fixture(scope='module', autouse=True) @@ -521,12 +522,12 @@ def factory( def location(db): cache = {} - def factory(dataset: models.Dataset, name, username) -> models.DatasetStorageLocation: + def factory(dataset: models.Dataset, name, username) -> DatasetStorageLocation: key = f'{dataset.datasetUri}-{name}' if cache.get(key): return cache.get(key) with db.scoped_session() as session: - ds_location = models.DatasetStorageLocation( + ds_location = DatasetStorageLocation( name=name, label=name, owner=username, diff --git a/tests/api/test_dataset.py b/tests/api/test_dataset.py index 057ff66a3..359a780b4 100644 --- a/tests/api/test_dataset.py +++ b/tests/api/test_dataset.py @@ -3,6 +3,7 @@ import pytest import dataall +from dataall.modules.datasets.db.models import DatasetStorageLocation @pytest.fixture(scope='module', autouse=True) @@ -235,7 +236,7 @@ def test_add_locations(location, dataset1, db): location(dataset=dataset1, name=f'unstructured{i+1}', username=dataset1.owner) with db.scoped_session() as session: - nb = session.query(dataall.db.models.DatasetStorageLocation).count() + nb = session.query(DatasetStorageLocation).count() assert nb == 10 diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index 478c8bf3c..fcdb18bb0 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -4,6 +4,7 @@ import dataall from dataall.searchproxy import indexers +from dataall.modules.datasets.db.models import DatasetStorageLocation @pytest.fixture(scope='module', autouse=True) @@ -89,7 +90,7 @@ def table(org, env, db, dataset): @pytest.fixture(scope='module', autouse=True) def folder(org, env, db, dataset): with db.scoped_session() as session: - location = dataall.db.models.DatasetStorageLocation( + location = DatasetStorageLocation( datasetUri=dataset.datasetUri, AWSAccountId='12345678901', S3Prefix='S3prefix', diff --git a/tests/tasks/conftest.py b/tests/tasks/conftest.py index 826ae524f..7e6f0d71a 100644 --- a/tests/tasks/conftest.py +++ b/tests/tasks/conftest.py @@ -1,9 +1,8 @@ -import boto3 -import os import pytest from dataall.db import models from dataall.api import constants +from dataall.modules.datasets.db.models import DatasetStorageLocation @pytest.fixture(scope="module") @@ -128,10 +127,10 @@ def factory( @pytest.fixture(scope="module") def 
location(db): - def factory(dataset: models.Dataset, label: str) -> models.DatasetStorageLocation: + def factory(dataset: models.Dataset, label: str) -> DatasetStorageLocation: with db.scoped_session() as session: - ds_location = models.DatasetStorageLocation( + ds_location = DatasetStorageLocation( name=label, label=label, owner=dataset.owner, @@ -198,7 +197,7 @@ def factory( def share_item_folder(db): def factory( share: models.ShareObject, - location: models.DatasetStorageLocation, + location: DatasetStorageLocation, ) -> models.ShareObjectItem: with db.scoped_session() as session: share_item = models.ShareObjectItem( diff --git a/tests/tasks/test_s3_share_manager.py b/tests/tasks/test_s3_share_manager.py index 53c7f426b..2841be87e 100644 --- a/tests/tasks/test_s3_share_manager.py +++ b/tests/tasks/test_s3_share_manager.py @@ -7,6 +7,7 @@ from dataall.tasks.data_sharing.share_managers.s3_share_manager import S3ShareManager from dataall.utils.alarm_service import AlarmService +from dataall.modules.datasets.db.models import DatasetStorageLocation SOURCE_ENV_ACCOUNT = "111111111111" @@ -68,7 +69,7 @@ def dataset1(dataset: Callable, org1: models.Organization, source_environment: m @pytest.fixture(scope="module") -def location1(location: Callable, dataset1: models.Dataset) -> models.DatasetStorageLocation: +def location1(location: Callable, dataset1: models.Dataset) -> DatasetStorageLocation: yield location(dataset1, "location1") @@ -81,7 +82,7 @@ def share1(share: Callable, dataset1: models.Dataset, @pytest.fixture(scope="module") -def share_item_folder1(share_item_folder: Callable, share1: models.ShareObject, location1: models.DatasetStorageLocation): +def share_item_folder1(share_item_folder: Callable, share1: models.ShareObject, location1: DatasetStorageLocation): share_item_folder1 = share_item_folder(share1, location1) return share_item_folder1 @@ -383,7 +384,7 @@ def test_grant_target_role_access_policy_test_no_policy( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -445,7 +446,7 @@ def test_update_dataset_bucket_key_policy_with_env_admin( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -562,7 +563,7 @@ def test_update_dataset_bucket_key_policy_without_env_admin( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -642,7 +643,7 @@ def test_manage_access_point_and_policy_1( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -733,7 +734,7 @@ def test_manage_access_point_and_policy_2( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -807,7 +808,7 @@ def test_manage_access_point_and_policy_3( db, share1: models.ShareObject, 
share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -878,7 +879,7 @@ def test_delete_access_point_policy_with_env_admin_one_prefix( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -950,7 +951,7 @@ def test_delete_access_point_policy_with_env_admin_multiple_prefix( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1017,7 +1018,7 @@ def test_dont_delete_access_point_with_policy( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1063,7 +1064,7 @@ def test_delete_access_point_without_policy( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1109,7 +1110,7 @@ def test_delete_target_role_access_policy_no_remaining_statement( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1174,7 +1175,7 @@ def test_delete_target_role_access_policy_with_remaining_statement( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1260,7 +1261,7 @@ def test_delete_dataset_bucket_key_policy_existing_policy_with_additional_target db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1351,7 +1352,7 @@ def test_delete_dataset_bucket_key_policy_existing_policy_with_no_additional_tar db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1424,7 +1425,7 @@ def test_handle_share_failure( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): @@ -1459,7 +1460,7 @@ def test_handle_revoke_failure( db, share1: models.ShareObject, share_item_folder1: models.ShareObjectItem, - location1: models.DatasetStorageLocation, + location1: DatasetStorageLocation, source_environment: models.Environment, target_environment: models.Environment, ): From b2566786250928b9328b2f4b89cfa725e34e5793 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 11:48:11 +0200 Subject: 
[PATCH 42/67] Datasets refactoring Moved DatasetStorageLocation into dataset services --- backend/dataall/aws/handlers/s3.py | 3 +- backend/dataall/db/api/__init__.py | 1 - .../api/storage_location/resolvers.py | 20 +++---- .../datasets/services}/dataset_location.py | 52 +++++++++---------- .../datasets/tasks/subscription_service.py | 3 +- 5 files changed, 40 insertions(+), 39 deletions(-) rename backend/dataall/{db/api => modules/datasets/services}/dataset_location.py (78%) diff --git a/backend/dataall/aws/handlers/s3.py b/backend/dataall/aws/handlers/s3.py index bcd0ad440..0be215ae3 100755 --- a/backend/dataall/aws/handlers/s3.py +++ b/backend/dataall/aws/handlers/s3.py @@ -4,6 +4,7 @@ from ...db import models from .service_handlers import Worker from .sts import SessionHelper +from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService log = logging.getLogger(__name__) @@ -13,7 +14,7 @@ class S3: @Worker.handler(path='s3.prefix.create') def create_dataset_location(engine, task: models.Task): with engine.scoped_session() as session: - location = db.api.DatasetStorageLocation.get_location_by_uri( + location = DatasetStorageLocationService.get_location_by_uri( session, task.targetUri ) S3.create_bucket_prefix(location) diff --git a/backend/dataall/db/api/__init__.py b/backend/dataall/db/api/__init__.py index a5f11d2c7..7bf8e0a4b 100644 --- a/backend/dataall/db/api/__init__.py +++ b/backend/dataall/db/api/__init__.py @@ -12,7 +12,6 @@ from .vote import Vote from .share_object import ShareObject, ShareObjectSM, ShareItemSM from .dataset import Dataset -from .dataset_location import DatasetStorageLocation from .notification import Notification from .redshift_cluster import RedshiftCluster from .vpc import Vpc diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 2eb18198c..e66e767a9 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -5,19 +5,19 @@ from dataall.db.api import ( ResourcePolicy, Glossary, - DatasetStorageLocation, Dataset, Environment, ) from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService def create_storage_location( context, source, datasetUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - location = DatasetStorageLocation.create_dataset_location( + location = DatasetStorageLocationService.create_dataset_location( session=session, username=context.username, groups=context.groups, @@ -40,15 +40,15 @@ def list_dataset_locations(context, source, filter: dict = None): if not filter: filter = {} with context.engine.scoped_session() as session: - return DatasetStorageLocation.list_dataset_locations( + return DatasetStorageLocationService.list_dataset_locations( session=session, uri=source.datasetUri, data=filter, check_perm=True ) def get_storage_location(context, source, locationUri=None): with context.engine.scoped_session() as session: - location = DatasetStorageLocation.get_location_by_uri(session, locationUri) - return DatasetStorageLocation.get_dataset_location( + location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) + return DatasetStorageLocationService.get_dataset_location( session=session, username=context.username, 
groups=context.groups, @@ -62,10 +62,10 @@ def update_storage_location( context, source, locationUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - location = DatasetStorageLocation.get_location_by_uri(session, locationUri) + location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) input['location'] = location input['locationUri'] = location.locationUri - DatasetStorageLocation.update_dataset_location( + DatasetStorageLocationService.update_dataset_location( session=session, username=context.username, groups=context.groups, @@ -80,8 +80,8 @@ def update_storage_location( def remove_storage_location(context, source, locationUri: str = None): with context.engine.scoped_session() as session: - location = DatasetStorageLocation.get_location_by_uri(session, locationUri) - DatasetStorageLocation.delete_dataset_location( + location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) + DatasetStorageLocationService.delete_dataset_location( session=session, username=context.username, groups=context.groups, @@ -103,7 +103,7 @@ def resolve_dataset(context, source: DatasetStorageLocation, **kwargs): def publish_location_update(context: Context, source, locationUri: str = None): with context.engine.scoped_session() as session: - location = DatasetStorageLocation.get_location_by_uri(session, locationUri) + location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) ResourcePolicy.check_user_resource_permission( session=session, username=context.username, diff --git a/backend/dataall/db/api/dataset_location.py b/backend/dataall/modules/datasets/services/dataset_location.py similarity index 78% rename from backend/dataall/db/api/dataset_location.py rename to backend/dataall/modules/datasets/services/dataset_location.py index e19f1dfb0..640f0a037 100644 --- a/backend/dataall/db/api/dataset_location.py +++ b/backend/dataall/modules/datasets/services/dataset_location.py @@ -3,15 +3,15 @@ from sqlalchemy import and_, or_ -from . import has_tenant_perm, has_resource_perm, Glossary -from .. 
import models, api, paginate, permissions, exceptions -from .dataset import Dataset -from dataall.modules.datasets.db.models import DatasetStorageLocation as DatasetStorageLocationModel +from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary +from dataall.db import models, api, paginate, permissions, exceptions +from dataall.db.api.dataset import Dataset +from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) -class DatasetStorageLocation: +class DatasetStorageLocationService: @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @has_resource_perm(permissions.CREATE_DATASET_FOLDER) @@ -22,14 +22,14 @@ def create_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> DatasetStorageLocationModel: + ) -> DatasetStorageLocation: dataset = Dataset.get_dataset_by_uri(session, uri) exists = ( - session.query(DatasetStorageLocationModel) + session.query(DatasetStorageLocation) .filter( and_( - DatasetStorageLocationModel.datasetUri == dataset.datasetUri, - DatasetStorageLocationModel.S3Prefix == data['prefix'], + DatasetStorageLocation.datasetUri == dataset.datasetUri, + DatasetStorageLocation.S3Prefix == data['prefix'], ) ) .count() @@ -41,7 +41,7 @@ def create_dataset_location( message=f'Folder: {data["prefix"]} already exist on dataset {uri}', ) - location = DatasetStorageLocationModel( + location = DatasetStorageLocation( datasetUri=dataset.datasetUri, label=data.get('label'), description=data.get('description', 'No description provided'), @@ -78,14 +78,14 @@ def list_dataset_locations( check_perm: bool = False, ) -> dict: query = ( - session.query(DatasetStorageLocationModel) - .filter(DatasetStorageLocationModel.datasetUri == uri) - .order_by(DatasetStorageLocationModel.created.desc()) + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == uri) + .order_by(DatasetStorageLocation.created.desc()) ) if data.get('term'): term = data.get('term') query = query.filter( - DatasetStorageLocationModel.label.ilike('%' + term + '%') + DatasetStorageLocation.label.ilike('%' + term + '%') ) return paginate( query, page=data.get('page', 1), page_size=data.get('pageSize', 10) @@ -101,8 +101,8 @@ def get_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> DatasetStorageLocationModel: - return DatasetStorageLocation.get_location_by_uri(session, data['locationUri']) + ) -> DatasetStorageLocation: + return DatasetStorageLocationService.get_location_by_uri(session, data['locationUri']) @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @@ -114,11 +114,11 @@ def update_dataset_location( uri: str, data: dict = None, check_perm: bool = False, - ) -> DatasetStorageLocationModel: + ) -> DatasetStorageLocation: location = data.get( 'location', - DatasetStorageLocation.get_location_by_uri(session, data['locationUri']), + DatasetStorageLocationService.get_location_by_uri(session, data['locationUri']), ) for k in data.keys(): @@ -145,7 +145,7 @@ def delete_dataset_location( data: dict = None, check_perm: bool = False, ): - location = DatasetStorageLocation.get_location_by_uri( + location = DatasetStorageLocationService.get_location_by_uri( session, data['locationUri'] ) share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() @@ -177,9 +177,9 @@ def delete_dataset_location( return True @staticmethod - def get_location_by_uri(session, location_uri) -> DatasetStorageLocationModel: + def get_location_by_uri(session, location_uri) -> 
DatasetStorageLocation: location: DatasetStorageLocation = session.query( - DatasetStorageLocationModel + DatasetStorageLocation ).get(location_uri) if not location: raise exceptions.ObjectNotFound('Folder', location_uri) @@ -187,13 +187,13 @@ def get_location_by_uri(session, location_uri) -> DatasetStorageLocationModel: @staticmethod def get_location_by_s3_prefix(session, s3_prefix, accountid, region): - location: DatasetStorageLocationModel = ( - session.query(DatasetStorageLocationModel) + location: DatasetStorageLocation = ( + session.query(DatasetStorageLocation) .filter( and_( - DatasetStorageLocationModel.S3Prefix.startswith(s3_prefix), - DatasetStorageLocationModel.AWSAccountId == accountid, - DatasetStorageLocationModel.region == region, + DatasetStorageLocation.S3Prefix.startswith(s3_prefix), + DatasetStorageLocation.AWSAccountId == accountid, + DatasetStorageLocation.region == region, ) ) .first() diff --git a/backend/dataall/modules/datasets/tasks/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py index 7df028914..94339d0f7 100644 --- a/backend/dataall/modules/datasets/tasks/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -16,6 +16,7 @@ from dataall.tasks.subscriptions import poll_queues from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService from dataall.modules.datasets.db.models import DatasetStorageLocation root = logging.getLogger() @@ -105,7 +106,7 @@ def publish_table_update_message(engine, message): @staticmethod def publish_location_update_message(session, message): location: DatasetStorageLocation = ( - db.api.DatasetStorageLocation.get_location_by_s3_prefix( + DatasetStorageLocationService.get_location_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), From 66b5ddbf8093f4c58f6de46552064dff3db57bf0 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 11:58:32 +0200 Subject: [PATCH 43/67] Datasets refactoring Extracted s3_location_handler --- backend/dataall/aws/handlers/s3.py | 35 -------------- .../modules/datasets/handlers/__init__.py | 6 ++- .../datasets/handlers/s3_location_handler.py | 48 +++++++++++++++++++ 3 files changed, 52 insertions(+), 37 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/s3_location_handler.py diff --git a/backend/dataall/aws/handlers/s3.py b/backend/dataall/aws/handlers/s3.py index 0be215ae3..2352ef791 100755 --- a/backend/dataall/aws/handlers/s3.py +++ b/backend/dataall/aws/handlers/s3.py @@ -1,51 +1,16 @@ import logging -from ... 
import db -from ...db import models -from .service_handlers import Worker from .sts import SessionHelper -from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService log = logging.getLogger(__name__) class S3: - @staticmethod - @Worker.handler(path='s3.prefix.create') - def create_dataset_location(engine, task: models.Task): - with engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri( - session, task.targetUri - ) - S3.create_bucket_prefix(location) - return location - @staticmethod def client(account_id: str, region: str, client_type: str): session = SessionHelper.remote_session(accountid=account_id) return session.client(client_type, region_name=region) - @staticmethod - def create_bucket_prefix(location): - try: - accountid = location.AWSAccountId - region = location.region - s3cli = S3.client(account_id=accountid, region=region, client_type='s3') - response = s3cli.put_object( - Bucket=location.S3BucketName, Body='', Key=location.S3Prefix + '/' - ) - log.info( - 'Creating S3 Prefix `{}`({}) on AWS #{}'.format( - location.S3BucketName, accountid, response - ) - ) - location.locationCreated = True - except Exception as e: - log.error( - f'Dataset storage location creation failed on S3 for dataset location {location.locationUri} : {e}' - ) - raise e - @staticmethod def create_bucket_policy(account_id: str, region: str, bucket_name: str, policy: str): try: diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py index a5d506712..382f052a9 100644 --- a/backend/dataall/modules/datasets/handlers/__init__.py +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -4,7 +4,9 @@ """ from dataall.modules.datasets.handlers import ( glue_column_handler, - glue_table_handler + glue_table_handler, + glue_profiling_handler, + s3_location_handler ) -__all__ = ["glue_column_handler", "glue_table_handler"] +__all__ = ["glue_column_handler", "glue_table_handler", "glue_profiling_handler", "s3_location_handler"] diff --git a/backend/dataall/modules/datasets/handlers/s3_location_handler.py b/backend/dataall/modules/datasets/handlers/s3_location_handler.py new file mode 100644 index 000000000..431a4cecd --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/s3_location_handler.py @@ -0,0 +1,48 @@ +import logging + +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import models +from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService + +log = logging.getLogger(__name__) + + +class S3DatasetLocationHandler: + """Handles async requests related to s3 for dataset storage location""" + + @staticmethod + def client(account_id: str, region: str, client_type: str): + session = SessionHelper.remote_session(accountid=account_id) + return session.client(client_type, region_name=region) + + @staticmethod + @Worker.handler(path='s3.prefix.create') + def create_dataset_location(engine, task: models.Task): + with engine.scoped_session() as session: + location = DatasetStorageLocationService.get_location_by_uri( + session, task.targetUri + ) + S3DatasetLocationHandler.create_bucket_prefix(location) + return location + + @staticmethod + def create_bucket_prefix(location): + try: + account_id = location.AWSAccountId + region = location.region + s3cli = S3DatasetLocationHandler.client(account_id=account_id, region=region, client_type='s3') + response = 
s3cli.put_object( + Bucket=location.S3BucketName, Body='', Key=location.S3Prefix + '/' + ) + log.info( + 'Creating S3 Prefix `{}`({}) on AWS #{}'.format( + location.S3BucketName, account_id, response + ) + ) + location.locationCreated = True + except Exception as e: + log.error( + f'Dataset storage location creation failed on S3 for dataset location {location.locationUri} : {e}' + ) + raise e From 352d82485aa4edbcfcaa972f3603ee2bdb0e0d96 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 12:12:58 +0200 Subject: [PATCH 44/67] Datasets refactoring Moved the dataset stack into modules --- backend/dataall/cdkproxy/stacks/__init__.py | 2 -- backend/dataall/modules/datasets/__init__.py | 12 +++++++++++ .../dataall/modules/datasets/cdk/__init__.py | 3 +++ .../datasets/cdk/dataset_stack.py} | 20 +++++++++---------- tests/cdkproxy/test_dataset_stack.py | 8 ++++---- 5 files changed, 29 insertions(+), 16 deletions(-) create mode 100644 backend/dataall/modules/datasets/cdk/__init__.py rename backend/dataall/{cdkproxy/stacks/dataset.py => modules/datasets/cdk/dataset_stack.py} (97%) diff --git a/backend/dataall/cdkproxy/stacks/__init__.py b/backend/dataall/cdkproxy/stacks/__init__.py index 3857b30c0..fb4674754 100644 --- a/backend/dataall/cdkproxy/stacks/__init__.py +++ b/backend/dataall/cdkproxy/stacks/__init__.py @@ -1,4 +1,3 @@ -from .dataset import Dataset from .environment import EnvironmentSetup from .pipeline import PipelineStack from .manager import stack, instanciate_stack, StackManager @@ -7,7 +6,6 @@ __all__ = [ 'EnvironmentSetup', - 'Dataset', 'StackManager', 'stack', 'StackManager', diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 842eba82b..f0bac92d2 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -39,3 +39,15 @@ def is_supported(cls, modes: List[ImportMode]): def __init__(self): import dataall.modules.datasets.handlers log.info("Dataset handlers have been imported") + + +class DatasetCdkModuleInterface(ModuleInterface): + """Loads dataset cdk stacks """ + + @classmethod + def is_supported(cls, modes: List[ImportMode]): + return ImportMode.CDK in modes + + def __init__(self): + import dataall.modules.datasets.cdk + log.info("Dataset stacks have been imported") diff --git a/backend/dataall/modules/datasets/cdk/__init__.py b/backend/dataall/modules/datasets/cdk/__init__.py new file mode 100644 index 000000000..5642d8a40 --- /dev/null +++ b/backend/dataall/modules/datasets/cdk/__init__.py @@ -0,0 +1,3 @@ +from dataall.modules.datasets.cdk import dataset_stack + +__all__ = ["dataset_stack"] diff --git a/backend/dataall/cdkproxy/stacks/dataset.py b/backend/dataall/modules/datasets/cdk/dataset_stack.py similarity index 97% rename from backend/dataall/cdkproxy/stacks/dataset.py rename to backend/dataall/modules/datasets/cdk/dataset_stack.py index 852cba66b..e99b43b0c 100644 --- a/backend/dataall/cdkproxy/stacks/dataset.py +++ b/backend/dataall/modules/datasets/cdk/dataset_stack.py @@ -19,22 +19,22 @@ from aws_cdk.aws_glue import CfnCrawler from sqlalchemy import and_, or_ -from .manager import stack -from ... 
import db -from ...aws.handlers.quicksight import Quicksight -from ...aws.handlers.lakeformation import LakeFormation -from ...aws.handlers.sts import SessionHelper -from ...db import models -from ...db.api import Environment -from ...utils.cdk_nag_utils import CDKNagUtil -from ...utils.runtime_stacks_tagging import TagsUtil +from dataall.cdkproxy.stacks.manager import stack +from dataall import db +from dataall.aws.handlers.quicksight import Quicksight +from dataall.aws.handlers.lakeformation import LakeFormation +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import models +from dataall.db.api import Environment +from dataall.utils.cdk_nag_utils import CDKNagUtil +from dataall.utils.runtime_stacks_tagging import TagsUtil from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @stack(stack='dataset') -class Dataset(Stack): +class DatasetStack(Stack): module_name = __file__ def get_engine(self) -> db.Engine: diff --git a/tests/cdkproxy/test_dataset_stack.py b/tests/cdkproxy/test_dataset_stack.py index 19a30d513..34f495056 100644 --- a/tests/cdkproxy/test_dataset_stack.py +++ b/tests/cdkproxy/test_dataset_stack.py @@ -3,14 +3,14 @@ import pytest from aws_cdk import App -from dataall.cdkproxy.stacks import Dataset +from dataall.modules.datasets.cdk.dataset_stack import DatasetStack @pytest.fixture(scope='function', autouse=True) def patch_methods(mocker, db, dataset, env, org): - mocker.patch('dataall.cdkproxy.stacks.dataset.Dataset.get_engine', return_value=db) + mocker.patch('dataall.cdkproxy.stacks.dataset.DatasetStack.get_engine', return_value=db) mocker.patch( - 'dataall.cdkproxy.stacks.dataset.Dataset.get_target', return_value=dataset + 'dataall.cdkproxy.stacks.dataset.DatasetStack.get_target', return_value=dataset ) mocker.patch( 'dataall.aws.handlers.sts.SessionHelper.get_delegation_role_name', @@ -41,7 +41,7 @@ def patch_methods(mocker, db, dataset, env, org): @pytest.fixture(scope='function', autouse=True) def template(dataset): app = App() - Dataset(app, 'Dataset', target_uri=dataset.datasetUri) + DatasetStack(app, 'Dataset', target_uri=dataset.datasetUri) return json.dumps(app.synth().get_stack_by_name('Dataset').template) From 9934a9cf81a52ce95287965c12c25397dad329cd Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Mon, 17 Apr 2023 13:42:22 +0200 Subject: [PATCH 45/67] Datasets refactoring Moved indexing into GlossaryRegistry --- .../dataall/api/Objects/Glossary/registry.py | 39 +++++++++++++++---- .../dataall/api/Objects/Glossary/resolvers.py | 16 +------- backend/dataall/modules/datasets/__init__.py | 8 +++- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 7c42e4f4c..0f0cdb61f 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -1,9 +1,12 @@ -from dataclasses import dataclass -from typing import Type, Dict, Optional, Protocol, Union +from dataclasses import dataclass, field +from typing import Type, Dict, Optional, Protocol, Union, Callable, Any + +from opensearchpy import OpenSearch from dataall.api import gql from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models +from dataall.searchproxy.indexers import upsert_dashboard, upsert_table, upsert_dataset class Identifiable(Protocol): @@ -17,13 +20,14 @@ class GlossaryDefinition: target_type: str object_type: str 
model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet + reindexer: Callable[[Any, OpenSearch, str], None] = None # a callback to reindex glossaries in open search def target_uri(self): return self.model.uri() class GlossaryRegistry(UnionTypeRegistry): - """Registry of glossary definition and API to retrieve data""" + """Registry of glossary definition and API to retrieve and reindex data""" _DEFINITIONS: Dict[str, GlossaryDefinition] = {} @classmethod @@ -50,8 +54,29 @@ def definitions(cls): def types(cls): return [gql.Ref(definition.object_type) for definition in cls._DEFINITIONS.values()] + @classmethod + def reindex(cls, session, es: OpenSearch, target_type: str, target_uri: str): + definition = cls._DEFINITIONS[target_type] + if definition.reindexer: + definition.reindexer(session, es, target_uri) + + +GlossaryRegistry.register(GlossaryDefinition( + target_type="Dashboard", + object_type="Dashboard", + model=models.Dashboard, + reindexer=upsert_dashboard +)) -GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(GlossaryDefinition("Dashboard", "Dashboard", models.Dashboard)) -GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(GlossaryDefinition("Dataset", "Dataset", models.Dataset)) +GlossaryRegistry.register(GlossaryDefinition( + target_type="DatasetTable", + object_type="DatasetTable", + model=models.DatasetTable, + reindexer=upsert_table +)) +GlossaryRegistry.register(GlossaryDefinition( + target_type="Dataset", + object_type="Dataset", + model=models.Dataset, + reindexer=upsert_dataset +)) diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 959578600..fdc4c3eea 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -6,15 +6,10 @@ from .... 
import db from ....api.context import Context from ....db import paginate, exceptions, models -from ....searchproxy import upsert_dataset -from ....searchproxy import upsert_table -from ....searchproxy.indexers import upsert_folder, upsert_dashboard from ....api.constants import ( GlossaryRole ) -from dataall.modules.datasets.db.models import DatasetStorageLocation - def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -462,15 +457,8 @@ def reindex(context, linkUri): link: models.TermLink = session.query(models.TermLink).get(linkUri) if not link: return - target = resolve_link_target(context, source=link) - if isinstance(target, models.Dataset): - upsert_dataset(session=session, es=context.es, datasetUri=link.targetUri) - elif isinstance(target, models.DatasetTable): - upsert_table(session=session, es=context.es, tableUri=link.targetUri) - elif isinstance(target, DatasetStorageLocation): - upsert_folder(session=session, es=context.es, locationUri=link.targetUri) - elif isinstance(target, models.Dashboard): - upsert_dashboard(session=session, es=context.es, dashboardUri=link.targetUri) + + GlossaryRegistry.reindex(session, context.es, link.targetType, link.targetUri) def _target_model(target_type: str): diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index f0bac92d2..3e50f37fa 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -6,6 +6,7 @@ from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation from dataall.modules.loader import ModuleInterface, ImportMode +from dataall.searchproxy.indexers import upsert_folder log = logging.getLogger(__name__) @@ -24,7 +25,12 @@ def __init__(self): FeedRegistry.register(FeedDefinition("DatasetStorageLocation", DatasetStorageLocation)) GlossaryRegistry.register(GlossaryDefinition("Column", "DatasetTableColumn", DatasetTableColumn)) - GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", DatasetStorageLocation)) + GlossaryRegistry.register(GlossaryDefinition( + target_type="Folder", + object_type="DatasetStorageLocation", + model=DatasetStorageLocation, + reindexer=upsert_folder + )) log.info("API of datasets has been imported") From 228c1753039667181edcd97af3a50a0915fd396e Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 18 Apr 2023 14:25:34 +0200 Subject: [PATCH 46/67] Datasets refactoring Removed dead code --- backend/dataall/db/api/environment.py | 92 --------------------------- 1 file changed, 92 deletions(-) diff --git a/backend/dataall/db/api/environment.py b/backend/dataall/db/api/environment.py index ac3777e5d..8642287db 100644 --- a/backend/dataall/db/api/environment.py +++ b/backend/dataall/db/api/environment.py @@ -1089,98 +1089,6 @@ def paginated_environment_networks( page_size=data.get('pageSize', 10), ).to_dict() - @staticmethod - @has_resource_perm(permissions.LIST_ENVIRONMENT_DATASETS) - def paginated_environment_data_items( - session, username, groups, uri, data=None, check_perm=None - ): - share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() - q = ( - session.query( - models.ShareObjectItem.shareUri.label('shareUri'), - models.Dataset.datasetUri.label('datasetUri'), - models.Dataset.name.label('datasetName'), - models.Dataset.description.label('datasetDescription'), - 
models.Environment.environmentUri.label('environmentUri'), - models.Environment.name.label('environmentName'), - models.ShareObject.created.label('created'), - models.ShareObjectItem.itemType.label('itemType'), - models.ShareObjectItem.GlueDatabaseName.label('GlueDatabaseName'), - models.ShareObjectItem.GlueTableName.label('GlueTableName'), - models.ShareObjectItem.S3AccessPointName.label('S3AccessPointName'), - models.Organization.organizationUri.label('organizationUri'), - models.Organization.name.label('organizationName'), - case( - [ - ( - models.ShareObjectItem.itemType - == ShareableType.Table.value, - func.concat( - models.DatasetTable.GlueDatabaseName, - '.', - models.DatasetTable.GlueTableName, - ), - ), - ( - models.ShareObjectItem.itemType - == ShareableType.StorageLocation.value, - func.concat(DatasetStorageLocation.name), - ), - ], - else_='XXX XXXX', - ).label('itemAccess'), - ) - .join( - models.ShareObject, - models.ShareObject.shareUri == models.ShareObjectItem.shareUri, - ) - .join( - models.Dataset, - models.ShareObject.datasetUri == models.Dataset.datasetUri, - ) - .join( - models.Environment, - models.Environment.environmentUri == models.Dataset.environmentUri, - ) - .join( - models.Organization, - models.Organization.organizationUri - == models.Environment.organizationUri, - ) - .outerjoin( - models.DatasetTable, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, - ) - .outerjoin( - DatasetStorageLocation, - models.ShareObjectItem.itemUri - == DatasetStorageLocation.locationUri, - ) - .filter( - and_( - models.ShareObjectItem.status.in_(share_item_shared_states), - models.ShareObject.environmentUri == uri, - ) - ) - ) - - if data.get('datasetUri'): - datasetUri = data.get('datasetUri') - q = q.filter(models.ShareObject.datasetUri == datasetUri) - - if data.get('itemTypes', None): - itemTypes = data.get('itemTypes') - q = q.filter( - or_(*[models.ShareObjectItem.itemType == t for t in itemTypes]) - ) - if data.get('term'): - term = data.get('term') - q = q.filter(models.ShareObjectItem.itemName.ilike('%' + term + '%')) - - return paginate( - query=q, page=data.get('page', 1), page_size=data.get('pageSize', 10) - ).to_dict() - @staticmethod def validate_invite_params(data): if not data: From 263d10cd4bb3b1c83b5392fd6ee0b0138e59977b Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 19 Apr 2023 10:08:11 +0200 Subject: [PATCH 47/67] Datasets refactoring Extracted dataset share service --- .../api/Objects/Environment/resolvers.py | 4 +- .../dataall/api/Objects/Group/resolvers.py | 4 +- backend/dataall/db/api/environment.py | 194 +---------------- .../services/dataset_share_service.py | 204 ++++++++++++++++++ 4 files changed, 211 insertions(+), 195 deletions(-) create mode 100644 backend/dataall/modules/datasets/services/dataset_share_service.py diff --git a/backend/dataall/api/Objects/Environment/resolvers.py b/backend/dataall/api/Objects/Environment/resolvers.py index 60af060a7..86f251f59 100644 --- a/backend/dataall/api/Objects/Environment/resolvers.py +++ b/backend/dataall/api/Objects/Environment/resolvers.py @@ -21,6 +21,8 @@ NamingConventionPattern, ) +from dataall.modules.datasets.services.dataset_share_service import DatasetShareService + log = logging.getLogger() @@ -391,7 +393,7 @@ def list_shared_with_environment_data_items( if not filter: filter = {} with context.engine.scoped_session() as session: - return db.api.Environment.paginated_shared_with_environment_datasets( + return 
DatasetShareService.paginated_shared_with_environment_datasets( session=session, username=context.username, groups=context.groups, diff --git a/backend/dataall/api/Objects/Group/resolvers.py b/backend/dataall/api/Objects/Group/resolvers.py index 9192b6b59..11de0da1b 100644 --- a/backend/dataall/api/Objects/Group/resolvers.py +++ b/backend/dataall/api/Objects/Group/resolvers.py @@ -4,7 +4,7 @@ from ....db import exceptions from ....db.models import Group from ....aws.handlers.cognito import Cognito - +from ....modules.datasets.services.dataset_share_service import DatasetShareService log = logging.getLogger() @@ -66,7 +66,7 @@ def list_data_items_shared_with_env_group( if not filter: filter = {} with context.engine.scoped_session() as session: - return db.api.Environment.paginated_shared_with_environment_group_datasets( + return DatasetShareService.paginated_shared_with_environment_group_datasets( session=session, username=context.username, groups=context.groups, diff --git a/backend/dataall/db/api/environment.py b/backend/dataall/db/api/environment.py index 8642287db..d1c7a67fa 100644 --- a/backend/dataall/db/api/environment.py +++ b/backend/dataall/db/api/environment.py @@ -1,11 +1,11 @@ import logging import re -from sqlalchemy import or_, case, func +from sqlalchemy import or_ from sqlalchemy.orm import Query from sqlalchemy.sql import and_ -from .. import exceptions, permissions, models, api +from .. import exceptions, permissions, models from . import ( has_resource_perm, has_tenant_perm, @@ -16,7 +16,6 @@ from ..api.organization import Organization from ..models import EnvironmentGroup from ..models.Enums import ( - ShareableType, EnvironmentType, EnvironmentPermission, ) @@ -29,8 +28,6 @@ NamingConventionPattern, ) -from dataall.modules.datasets.db.models import DatasetStorageLocation - log = logging.getLogger(__name__) @@ -871,195 +868,8 @@ def paginated_environment_group_datasets( page_size=data.get('pageSize', 10), ).to_dict() - @staticmethod - @has_resource_perm(permissions.LIST_ENVIRONMENT_SHARED_WITH_OBJECTS) - def paginated_shared_with_environment_datasets( - session, username, groups, uri, data=None, check_perm=None - ) -> dict: - share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() - q = ( - session.query( - models.ShareObjectItem.shareUri.label('shareUri'), - models.Dataset.datasetUri.label('datasetUri'), - models.Dataset.name.label('datasetName'), - models.Dataset.description.label('datasetDescription'), - models.Environment.environmentUri.label('environmentUri'), - models.Environment.name.label('environmentName'), - models.ShareObject.created.label('created'), - models.ShareObject.principalId.label('principalId'), - models.ShareObjectItem.itemType.label('itemType'), - models.ShareObjectItem.GlueDatabaseName.label('GlueDatabaseName'), - models.ShareObjectItem.GlueTableName.label('GlueTableName'), - models.ShareObjectItem.S3AccessPointName.label('S3AccessPointName'), - models.Organization.organizationUri.label('organizationUri'), - models.Organization.name.label('organizationName'), - case( - [ - ( - models.ShareObjectItem.itemType - == ShareableType.Table.value, - func.concat( - models.DatasetTable.GlueDatabaseName, - '.', - models.DatasetTable.GlueTableName, - ), - ), - ( - models.ShareObjectItem.itemType - == ShareableType.StorageLocation.value, - func.concat(DatasetStorageLocation.name), - ), - ], - else_='XXX XXXX', - ).label('itemAccess'), - ) - .join( - models.ShareObject, - models.ShareObject.shareUri == models.ShareObjectItem.shareUri, - ) - 
.join( - models.Dataset, - models.ShareObject.datasetUri == models.Dataset.datasetUri, - ) - .join( - models.Environment, - models.Environment.environmentUri == models.Dataset.environmentUri, - ) - .join( - models.Organization, - models.Organization.organizationUri - == models.Environment.organizationUri, - ) - .outerjoin( - models.DatasetTable, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, - ) - .outerjoin( - DatasetStorageLocation, - models.ShareObjectItem.itemUri - == DatasetStorageLocation.locationUri, - ) - .filter( - and_( - models.ShareObjectItem.status.in_(share_item_shared_states), - models.ShareObject.environmentUri == uri, - ) - ) - ) - - if data.get('datasetUri'): - datasetUri = data.get('datasetUri') - q = q.filter(models.ShareObject.datasetUri == datasetUri) - - if data.get('itemTypes', None): - itemTypes = data.get('itemTypes') - q = q.filter( - or_(*[models.ShareObjectItem.itemType == t for t in itemTypes]) - ) - - if data.get("uniqueDatasets", False): - q = q.distinct(models.ShareObject.datasetUri) - - if data.get('term'): - term = data.get('term') - q = q.filter(models.ShareObjectItem.itemName.ilike('%' + term + '%')) - - return paginate( - query=q, page=data.get('page', 1), page_size=data.get('pageSize', 10) - ).to_dict() - - @staticmethod - def paginated_shared_with_environment_group_datasets( - session, username, groups, envUri, groupUri, data=None, check_perm=None - ) -> dict: - share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() - q = ( - session.query( - models.ShareObjectItem.shareUri.label('shareUri'), - models.Dataset.datasetUri.label('datasetUri'), - models.Dataset.name.label('datasetName'), - models.Dataset.description.label('datasetDescription'), - models.Environment.environmentUri.label('environmentUri'), - models.Environment.name.label('environmentName'), - models.ShareObject.created.label('created'), - models.ShareObject.principalId.label('principalId'), - models.ShareObjectItem.itemType.label('itemType'), - models.ShareObjectItem.GlueDatabaseName.label('GlueDatabaseName'), - models.ShareObjectItem.GlueTableName.label('GlueTableName'), - models.ShareObjectItem.S3AccessPointName.label('S3AccessPointName'), - models.Organization.organizationUri.label('organizationUri'), - models.Organization.name.label('organizationName'), - case( - [ - ( - models.ShareObjectItem.itemType - == ShareableType.Table.value, - func.concat( - models.DatasetTable.GlueDatabaseName, - '.', - models.DatasetTable.GlueTableName, - ), - ), - ( - models.ShareObjectItem.itemType - == ShareableType.StorageLocation.value, - func.concat(DatasetStorageLocation.name), - ), - ], - else_='XXX XXXX', - ).label('itemAccess'), - ) - .join( - models.ShareObject, - models.ShareObject.shareUri == models.ShareObjectItem.shareUri, - ) - .join( - models.Dataset, - models.ShareObject.datasetUri == models.Dataset.datasetUri, - ) - .join( - models.Environment, - models.Environment.environmentUri == models.Dataset.environmentUri, - ) - .join( - models.Organization, - models.Organization.organizationUri - == models.Environment.organizationUri, - ) - .outerjoin( - models.DatasetTable, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, - ) - .outerjoin( - DatasetStorageLocation, - models.ShareObjectItem.itemUri - == DatasetStorageLocation.locationUri, - ) - .filter( - and_( - models.ShareObjectItem.status.in_(share_item_shared_states), - models.ShareObject.environmentUri == envUri, - models.ShareObject.principalId == groupUri, - ) - ) - ) - if 
data.get('datasetUri'): - datasetUri = data.get('datasetUri') - q = q.filter(models.ShareObject.datasetUri == datasetUri) - if data.get('itemTypes', None): - itemTypes = data.get('itemTypes') - q = q.filter( - or_(*[models.ShareObjectItem.itemType == t for t in itemTypes]) - ) - if data.get('term'): - term = data.get('term') - q = q.filter(models.ShareObjectItem.itemName.ilike('%' + term + '%')) - - return paginate( - query=q, page=data.get('page', 1), page_size=data.get('pageSize', 10) - ).to_dict() @staticmethod def query_environment_networks(session, username, groups, uri, filter) -> Query: diff --git a/backend/dataall/modules/datasets/services/dataset_share_service.py b/backend/dataall/modules/datasets/services/dataset_share_service.py new file mode 100644 index 000000000..9ca84a1cf --- /dev/null +++ b/backend/dataall/modules/datasets/services/dataset_share_service.py @@ -0,0 +1,204 @@ +import logging +import re + +from sqlalchemy import or_, case, func +from sqlalchemy.sql import and_ + +from dataall.api.constants import ShareableType +from dataall.db import models, permissions +from dataall.db.api import has_resource_perm, ShareItemSM +from dataall.db.paginator import paginate +from dataall.modules.datasets.db.models import DatasetStorageLocation + + +class DatasetShareService: + + @staticmethod + @has_resource_perm(permissions.LIST_ENVIRONMENT_SHARED_WITH_OBJECTS) + def paginated_shared_with_environment_datasets( + session, username, groups, uri, data=None, check_perm=None + ) -> dict: + share_item_shared_states = ShareItemSM.get_share_item_shared_states() + q = ( + session.query( + models.ShareObjectItem.shareUri.label('shareUri'), + models.Dataset.datasetUri.label('datasetUri'), + models.Dataset.name.label('datasetName'), + models.Dataset.description.label('datasetDescription'), + models.Environment.environmentUri.label('environmentUri'), + models.Environment.name.label('environmentName'), + models.ShareObject.created.label('created'), + models.ShareObject.principalId.label('principalId'), + models.ShareObjectItem.itemType.label('itemType'), + models.ShareObjectItem.GlueDatabaseName.label('GlueDatabaseName'), + models.ShareObjectItem.GlueTableName.label('GlueTableName'), + models.ShareObjectItem.S3AccessPointName.label('S3AccessPointName'), + models.Organization.organizationUri.label('organizationUri'), + models.Organization.name.label('organizationName'), + case( + [ + ( + models.ShareObjectItem.itemType + == ShareableType.Table.value, + func.concat( + models.DatasetTable.GlueDatabaseName, + '.', + models.DatasetTable.GlueTableName, + ), + ), + ( + models.ShareObjectItem.itemType + == ShareableType.StorageLocation.value, + func.concat(DatasetStorageLocation.name), + ), + ], + else_='XXX XXXX', + ).label('itemAccess'), + ) + .join( + models.ShareObject, + models.ShareObject.shareUri == models.ShareObjectItem.shareUri, + ) + .join( + models.Dataset, + models.ShareObject.datasetUri == models.Dataset.datasetUri, + ) + .join( + models.Environment, + models.Environment.environmentUri == models.Dataset.environmentUri, + ) + .join( + models.Organization, + models.Organization.organizationUri + == models.Environment.organizationUri, + ) + .outerjoin( + models.DatasetTable, + models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + ) + .outerjoin( + DatasetStorageLocation, + models.ShareObjectItem.itemUri + == DatasetStorageLocation.locationUri, + ) + .filter( + and_( + models.ShareObjectItem.status.in_(share_item_shared_states), + models.ShareObject.environmentUri == uri, + ) + ) 
+ ) + + if data.get('datasetUri'): + datasetUri = data.get('datasetUri') + q = q.filter(models.ShareObject.datasetUri == datasetUri) + + if data.get('itemTypes', None): + itemTypes = data.get('itemTypes') + q = q.filter( + or_(*[models.ShareObjectItem.itemType == t for t in itemTypes]) + ) + + if data.get("uniqueDatasets", False): + q = q.distinct(models.ShareObject.datasetUri) + + if data.get('term'): + term = data.get('term') + q = q.filter(models.ShareObjectItem.itemName.ilike('%' + term + '%')) + + return paginate( + query=q, page=data.get('page', 1), page_size=data.get('pageSize', 10) + ).to_dict() + + @staticmethod + def paginated_shared_with_environment_group_datasets( + session, username, groups, envUri, groupUri, data=None, check_perm=None + ) -> dict: + share_item_shared_states = ShareItemSM.get_share_item_shared_states() + q = ( + session.query( + models.ShareObjectItem.shareUri.label('shareUri'), + models.Dataset.datasetUri.label('datasetUri'), + models.Dataset.name.label('datasetName'), + models.Dataset.description.label('datasetDescription'), + models.Environment.environmentUri.label('environmentUri'), + models.Environment.name.label('environmentName'), + models.ShareObject.created.label('created'), + models.ShareObject.principalId.label('principalId'), + models.ShareObjectItem.itemType.label('itemType'), + models.ShareObjectItem.GlueDatabaseName.label('GlueDatabaseName'), + models.ShareObjectItem.GlueTableName.label('GlueTableName'), + models.ShareObjectItem.S3AccessPointName.label('S3AccessPointName'), + models.Organization.organizationUri.label('organizationUri'), + models.Organization.name.label('organizationName'), + case( + [ + ( + models.ShareObjectItem.itemType + == ShareableType.Table.value, + func.concat( + models.DatasetTable.GlueDatabaseName, + '.', + models.DatasetTable.GlueTableName, + ), + ), + ( + models.ShareObjectItem.itemType + == ShareableType.StorageLocation.value, + func.concat(DatasetStorageLocation.name), + ), + ], + else_='XXX XXXX', + ).label('itemAccess'), + ) + .join( + models.ShareObject, + models.ShareObject.shareUri == models.ShareObjectItem.shareUri, + ) + .join( + models.Dataset, + models.ShareObject.datasetUri == models.Dataset.datasetUri, + ) + .join( + models.Environment, + models.Environment.environmentUri == models.Dataset.environmentUri, + ) + .join( + models.Organization, + models.Organization.organizationUri + == models.Environment.organizationUri, + ) + .outerjoin( + models.DatasetTable, + models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + ) + .outerjoin( + DatasetStorageLocation, + models.ShareObjectItem.itemUri + == DatasetStorageLocation.locationUri, + ) + .filter( + and_( + models.ShareObjectItem.status.in_(share_item_shared_states), + models.ShareObject.environmentUri == envUri, + models.ShareObject.principalId == groupUri, + ) + ) + ) + + if data.get('datasetUri'): + datasetUri = data.get('datasetUri') + q = q.filter(models.ShareObject.datasetUri == datasetUri) + + if data.get('itemTypes', None): + itemTypes = data.get('itemTypes') + q = q.filter( + or_(*[models.ShareObjectItem.itemType == t for t in itemTypes]) + ) + if data.get('term'): + term = data.get('term') + q = q.filter(models.ShareObjectItem.itemName.ilike('%' + term + '%')) + + return paginate( + query=q, page=data.get('page', 1), page_size=data.get('pageSize', 10) + ).to_dict() From 417e6e50fcd120c16bc8207c76a5733e7005ab04 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 19 Apr 2023 11:21:46 +0200 Subject: [PATCH 48/67] Datasets refactoring 
Solved broken reference --- .../dataall/modules/datasets/api/storage_location/resolvers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index e66e767a9..32cfcfcac 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -8,6 +8,7 @@ Dataset, Environment, ) +from dataall.modules.datasets.handlers.s3_location_handler import S3DatasetLocationHandler from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService @@ -26,7 +27,7 @@ def create_storage_location( check_perm=True, ) - S3.create_bucket_prefix(location) + S3DatasetLocationHandler.create_bucket_prefix(location) indexers.upsert_folder( session=session, es=context.es, locationUri=location.locationUri From 7aaff5b113182485b9b163a51c83f1234043c5a5 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 19 Apr 2023 16:30:09 +0200 Subject: [PATCH 49/67] Introduced Indexers --- .../api/Objects/Dashboard/resolvers.py | 5 +- .../dataall/api/Objects/Dataset/resolvers.py | 14 +- .../api/Objects/DatasetTable/resolvers.py | 5 +- .../dataall/api/Objects/Glossary/registry.py | 13 +- backend/dataall/api/Objects/Vote/resolvers.py | 7 +- backend/dataall/modules/datasets/__init__.py | 4 +- .../api/storage_location/resolvers.py | 7 +- backend/dataall/searchproxy/__init__.py | 5 - backend/dataall/searchproxy/indexers.py | 565 +++++++++--------- backend/dataall/searchproxy/upsert.py | 68 ++- backend/dataall/tasks/catalog_indexer.py | 8 +- tests/api/conftest.py | 9 +- tests/searchproxy/test_indexers.py | 15 +- tests/tasks/test_catalog_indexer.py | 2 +- 14 files changed, 380 insertions(+), 347 deletions(-) diff --git a/backend/dataall/api/Objects/Dashboard/resolvers.py b/backend/dataall/api/Objects/Dashboard/resolvers.py index 714b6c4b9..84a2a1bcc 100644 --- a/backend/dataall/api/Objects/Dashboard/resolvers.py +++ b/backend/dataall/api/Objects/Dashboard/resolvers.py @@ -8,6 +8,7 @@ from ....db.api import ResourcePolicy, Glossary, Vote from ....searchproxy import indexers from ....utils import Parameter +from dataall.searchproxy.indexers import DashboardIndexer param_store = Parameter() ENVNAME = os.getenv("envname", "local") @@ -146,7 +147,7 @@ def import_dashboard(context: Context, source, input: dict = None): check_perm=True, ) - indexers.upsert_dashboard(session, context.es, dashboard.dashboardUri) + DashboardIndexer.upsert(session, dashboard_uri=dashboard.dashboardUri) return dashboard @@ -166,7 +167,7 @@ def update_dashboard(context, source, input: dict = None): check_perm=True, ) - indexers.upsert_dashboard(session, context.es, dashboard.dashboardUri) + DashboardIndexer.upsert(session, dashboard_uri=dashboard.dashboardUri) return dashboard diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index a03b0647f..fcbc2d6f3 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -13,11 +13,11 @@ from ....aws.handlers.glue import Glue from ....aws.handlers.service_handlers import Worker from ....aws.handlers.sts import SessionHelper -from ....aws.handlers.sns import Sns from ....db import paginate, exceptions, permissions, models from 
....db.api import Dataset, Environment, ShareObject, ResourcePolicy from ....db.api.organization import Organization -from ....searchproxy import indexers +from dataall.searchproxy import indexers +from dataall.searchproxy.indexers import DatasetIndexer log = logging.getLogger(__name__) @@ -34,8 +34,8 @@ def create_dataset(context: Context, source, input=None): ) Dataset.create_dataset_stack(session, dataset) - indexers.upsert_dataset( - session=session, es=context.es, datasetUri=dataset.datasetUri + DatasetIndexer.upsert( + session=session, dataset_uri=dataset.datasetUri ) stack_helper.deploy_dataset_stack(dataset) @@ -72,8 +72,8 @@ def import_dataset(context: Context, source, input=None): Dataset.create_dataset_stack(session, dataset) - indexers.upsert_dataset( - session=session, es=context.es, datasetUri=dataset.datasetUri + DatasetIndexer.upsert( + session=session, dataset_uri=dataset.datasetUri ) stack_helper.deploy_dataset_stack(dataset) @@ -220,7 +220,7 @@ def update_dataset(context, source, datasetUri: str = None, input: dict = None): data=input, check_perm=True, ) - indexers.upsert_dataset(session, context.es, datasetUri) + DatasetIndexer.upsert(session, dataset_uri=datasetUri) stack_helper.deploy_dataset_stack(updated_dataset) diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 3e2b833e3..567985348 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,6 +13,7 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils +from dataall.searchproxy.indexers import DatasetTableIndexer from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -28,7 +29,7 @@ def create_table(context, source, datasetUri: str = None, input: dict = None): data=input, check_perm=True, ) - indexers.upsert_table(session, context.es, table.tableUri) + DatasetTableIndexer.upsert(session, table_uri=table.tableUri) return table @@ -80,7 +81,7 @@ def update_table(context, source, tableUri: str = None, input: dict = None): data=input, check_perm=True, ) - indexers.upsert_table(session, context.es, table.tableUri) + DatasetTableIndexer.upsert(session, table_uri=table.tableUri) return table diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 0f0cdb61f..cb82bf208 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -6,7 +6,8 @@ from dataall.api import gql from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models -from dataall.searchproxy.indexers import upsert_dashboard, upsert_table, upsert_dataset +from dataall.searchproxy.indexers import DashboardIndexer, DatasetTableIndexer, DatasetIndexer +from dataall.searchproxy.upsert import BaseIndexer class Identifiable(Protocol): @@ -20,7 +21,7 @@ class GlossaryDefinition: target_type: str object_type: str model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet - reindexer: Callable[[Any, OpenSearch, str], None] = None # a callback to reindex glossaries in open search + reindexer: Type[BaseIndexer] = None # a callback to reindex glossaries in open search def target_uri(self): return self.model.uri() @@ -58,25 +59,25 @@ def types(cls): def reindex(cls, session, es: 
OpenSearch, target_type: str, target_uri: str): definition = cls._DEFINITIONS[target_type] if definition.reindexer: - definition.reindexer(session, es, target_uri) + definition.reindexer.upsert(session, target_uri) GlossaryRegistry.register(GlossaryDefinition( target_type="Dashboard", object_type="Dashboard", model=models.Dashboard, - reindexer=upsert_dashboard + reindexer=DashboardIndexer )) GlossaryRegistry.register(GlossaryDefinition( target_type="DatasetTable", object_type="DatasetTable", model=models.DatasetTable, - reindexer=upsert_table + reindexer=DatasetTableIndexer )) GlossaryRegistry.register(GlossaryDefinition( target_type="Dataset", object_type="Dataset", model=models.Dataset, - reindexer=upsert_dataset + reindexer=DatasetIndexer )) diff --git a/backend/dataall/api/Objects/Vote/resolvers.py b/backend/dataall/api/Objects/Vote/resolvers.py index da41462cd..34dcd9f05 100644 --- a/backend/dataall/api/Objects/Vote/resolvers.py +++ b/backend/dataall/api/Objects/Vote/resolvers.py @@ -1,7 +1,6 @@ from .... import db from ....api.context import Context -from ....searchproxy.indexers import upsert_dashboard -from ....searchproxy.indexers import upsert_dataset +from dataall.searchproxy.indexers import DatasetIndexer, DashboardIndexer def count_upvotes( @@ -34,9 +33,9 @@ def upvote(context: Context, source, input=None): def reindex(session, es, vote): if vote.targetType == 'dataset': - upsert_dataset(session=session, es=es, datasetUri=vote.targetUri) + DatasetIndexer.upsert(session=session, dataset_uri=vote.targetUri) elif vote.targetType == 'dashboard': - upsert_dashboard(session=session, es=es, dashboardUri=vote.targetUri) + DashboardIndexer.upsert(session=session, dashboard_uri=vote.targetUri) def get_vote(context: Context, source, targetUri: str = None, targetType: str = None): diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 3e50f37fa..a2f600f68 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -6,7 +6,7 @@ from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation from dataall.modules.loader import ModuleInterface, ImportMode -from dataall.searchproxy.indexers import upsert_folder +from dataall.searchproxy.indexers import DatasetLocationIndexer log = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def __init__(self): target_type="Folder", object_type="DatasetStorageLocation", model=DatasetStorageLocation, - reindexer=upsert_folder + reindexer=DatasetLocationIndexer )) log.info("API of datasets has been imported") diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 32cfcfcac..4aebc1458 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -12,6 +12,7 @@ from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService +from dataall.searchproxy.indexers import DatasetLocationIndexer def create_storage_location( @@ -29,9 +30,7 @@ def create_storage_location( S3DatasetLocationHandler.create_bucket_prefix(location) - indexers.upsert_folder( - session=session, es=context.es, locationUri=location.locationUri - ) + 
DatasetLocationIndexer.upsert(session=session, folder_uri=location.locationUri) return location @@ -74,7 +73,7 @@ def update_storage_location( data=input, check_perm=True, ) - indexers.upsert_folder(session, context.es, location.locationUri) + DatasetLocationIndexer.upsert(session, folder_uri=location.locationUri) return location diff --git a/backend/dataall/searchproxy/__init__.py b/backend/dataall/searchproxy/__init__.py index 1a69dac6c..8b648babe 100644 --- a/backend/dataall/searchproxy/__init__.py +++ b/backend/dataall/searchproxy/__init__.py @@ -1,15 +1,10 @@ from .connect import connect -from .indexers import upsert_dataset -from .indexers import upsert_table from .indexers import upsert_dataset_tables from .search import run_query -from .upsert import upsert __all__ = [ 'connect', 'run_query', 'upsert', - 'upsert_dataset', - 'upsert_table', 'upsert_dataset_tables', ] diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 7361c2150..34c1c3d17 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -1,319 +1,300 @@ import logging from sqlalchemy import and_ -from sqlalchemy.orm import with_expression -from .upsert import upsert from .. import db from ..db import models +from dataall.searchproxy.upsert import BaseIndexer from dataall.modules.datasets.db.models import DatasetStorageLocation log = logging.getLogger(__name__) -def get_target_glossary_terms(session, targetUri): - q = ( - session.query(models.TermLink) - .options( - with_expression(models.TermLink.path, models.GlossaryNode.path), - with_expression(models.TermLink.label, models.GlossaryNode.label), - with_expression(models.TermLink.readme, models.GlossaryNode.readme), +class DatasetIndexer(BaseIndexer): + + @classmethod + def upsert(cls, session, dataset_uri: str): + dataset = ( + session.query( + models.Dataset.datasetUri.label('datasetUri'), + models.Dataset.name.label('name'), + models.Dataset.owner.label('owner'), + models.Dataset.label.label('label'), + models.Dataset.description.label('description'), + models.Dataset.confidentiality.label('classification'), + models.Dataset.tags.label('tags'), + models.Dataset.topics.label('topics'), + models.Dataset.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.GlueDatabaseName.label('database'), + models.Dataset.S3BucketName.label('source'), + models.Dataset.created, + models.Dataset.updated, + models.Dataset.deleted, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(models.Dataset.datasetUri == dataset_uri) + .first() ) - .join( - models.GlossaryNode, models.GlossaryNode.nodeUri == models.TermLink.nodeUri + count_tables = db.api.Dataset.count_dataset_tables(session, dataset_uri) + count_folders = db.api.Dataset.count_dataset_locations(session, dataset_uri) + count_upvotes = db.api.Vote.count_upvotes( + session, None, None, dataset_uri, {'targetType': 'dataset'} ) - .filter( - and_( - models.TermLink.targetUri == targetUri, - models.TermLink.approvedBySteward.is_(True), + + if dataset: + glossary = BaseIndexer._get_target_glossary_terms(session, dataset_uri) + 
BaseIndexer._index( + doc_id=dataset_uri, + doc={ + 'name': dataset.name, + 'owner': dataset.owner, + 'label': dataset.label, + 'admins': dataset.admins, + 'database': dataset.database, + 'source': dataset.source, + 'resourceKind': 'dataset', + 'description': dataset.description, + 'classification': dataset.classification, + 'tags': [t.replace('-', '') for t in dataset.tags or []], + 'topics': dataset.topics, + 'region': dataset.region.replace('-', ''), + 'environmentUri': dataset.envUri, + 'environmentName': dataset.envName, + 'organizationUri': dataset.orgUri, + 'organizationName': dataset.orgName, + 'created': dataset.created, + 'updated': dataset.updated, + 'deleted': dataset.deleted, + 'glossary': glossary, + 'tables': count_tables, + 'folders': count_folders, + 'upvotes': count_upvotes, + }, ) - ) - ) - return [t.path for t in q] + return dataset -def upsert_dataset(session, es, datasetUri: str): - dataset = ( - session.query( - models.Dataset.datasetUri.label('datasetUri'), - models.Dataset.name.label('name'), - models.Dataset.owner.label('owner'), - models.Dataset.label.label('label'), - models.Dataset.description.label('description'), - models.Dataset.confidentiality.label('classification'), - models.Dataset.tags.label('tags'), - models.Dataset.topics.label('topics'), - models.Dataset.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.GlueDatabaseName.label('database'), - models.Dataset.S3BucketName.label('source'), - models.Dataset.created, - models.Dataset.updated, - models.Dataset.deleted, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(models.Dataset.datasetUri == datasetUri) - .first() - ) - count_tables = db.api.Dataset.count_dataset_tables(session, datasetUri) - count_folders = db.api.Dataset.count_dataset_locations(session, datasetUri) - count_upvotes = db.api.Vote.count_upvotes( - session, None, None, datasetUri, {'targetType': 'dataset'} - ) +class DatasetTableIndexer(BaseIndexer): - if dataset: - glossary = get_target_glossary_terms(session, datasetUri) - upsert( - es=es, - index='dataall-index', - id=datasetUri, - doc={ - 'name': dataset.name, - 'owner': dataset.owner, - 'label': dataset.label, - 'admins': dataset.admins, - 'database': dataset.database, - 'source': dataset.source, - 'resourceKind': 'dataset', - 'description': dataset.description, - 'classification': dataset.classification, - 'tags': [t.replace('-', '') for t in dataset.tags or []], - 'topics': dataset.topics, - 'region': dataset.region.replace('-', ''), - 'environmentUri': dataset.envUri, - 'environmentName': dataset.envName, - 'organizationUri': dataset.orgUri, - 'organizationName': dataset.orgName, - 'created': dataset.created, - 'updated': dataset.updated, - 'deleted': dataset.deleted, - 'glossary': glossary, - 'tables': count_tables, - 'folders': count_folders, - 'upvotes': count_upvotes, - }, + @classmethod + def upsert(cls, session, table_uri: str): + table = ( + session.query( + models.DatasetTable.datasetUri.label('datasetUri'), + models.DatasetTable.tableUri.label('uri'), + models.DatasetTable.name.label('name'), + models.DatasetTable.owner.label('owner'), + 
models.DatasetTable.label.label('label'), + models.DatasetTable.description.label('description'), + models.Dataset.confidentiality.label('classification'), + models.DatasetTable.tags.label('tags'), + models.Dataset.topics.label('topics'), + models.Dataset.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.GlueDatabaseName.label('database'), + models.Dataset.S3BucketName.label('source'), + models.DatasetTable.created, + models.DatasetTable.updated, + models.DatasetTable.deleted, + ) + .join( + models.Dataset, + models.Dataset.datasetUri == models.DatasetTable.datasetUri, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(models.DatasetTable.tableUri == table_uri) + .first() ) - return dataset - -def upsert_table(session, es, tableUri: str): - table = ( - session.query( - models.DatasetTable.datasetUri.label('datasetUri'), - models.DatasetTable.tableUri.label('uri'), - models.DatasetTable.name.label('name'), - models.DatasetTable.owner.label('owner'), - models.DatasetTable.label.label('label'), - models.DatasetTable.description.label('description'), - models.Dataset.confidentiality.label('classification'), - models.DatasetTable.tags.label('tags'), - models.Dataset.topics.label('topics'), - models.Dataset.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.GlueDatabaseName.label('database'), - models.Dataset.S3BucketName.label('source'), - models.DatasetTable.created, - models.DatasetTable.updated, - models.DatasetTable.deleted, - ) - .join( - models.Dataset, - models.Dataset.datasetUri == models.DatasetTable.datasetUri, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(models.DatasetTable.tableUri == tableUri) - .first() - ) + if table: + glossary = BaseIndexer._get_target_glossary_terms(session, table_uri) + tags = table.tags if table.tags else [] + BaseIndexer._index( + doc_id=table_uri, + doc={ + 'name': table.name, + 'admins': table.admins, + 'owner': table.owner, + 'label': table.label, + 'resourceKind': 'table', + 'description': table.description, + 'database': table.database, + 'source': table.source, + 'classification': table.classification, + 'tags': [t.replace('-', '') for t in tags or []], + 'topics': table.topics, + 'region': table.region.replace('-', ''), + 'datasetUri': table.datasetUri, + 'environmentUri': table.envUri, + 'environmentName': table.envName, + 'organizationUri': table.orgUri, + 'organizationName': table.orgName, + 'created': table.created, + 'updated': table.updated, + 'deleted': table.deleted, + 'glossary': glossary, + }, + ) + DatasetIndexer.upsert(session=session, dataset_uri=table.datasetUri) + return table - if table: - glossary = get_target_glossary_terms(session, tableUri) - tags = table.tags if table.tags else [] - upsert( - es=es, - 
index='dataall-index', - id=tableUri, - doc={ - 'name': table.name, - 'admins': table.admins, - 'owner': table.owner, - 'label': table.label, - 'resourceKind': 'table', - 'description': table.description, - 'database': table.database, - 'source': table.source, - 'classification': table.classification, - 'tags': [t.replace('-', '') for t in tags or []], - 'topics': table.topics, - 'region': table.region.replace('-', ''), - 'datasetUri': table.datasetUri, - 'environmentUri': table.envUri, - 'environmentName': table.envName, - 'organizationUri': table.orgUri, - 'organizationName': table.orgName, - 'created': table.created, - 'updated': table.updated, - 'deleted': table.deleted, - 'glossary': glossary, - }, - ) - upsert_dataset(session, es, table.datasetUri) - return table +class DatasetLocationIndexer(BaseIndexer): -def upsert_folder(session, es, locationUri: str): - folder = ( - session.query( - DatasetStorageLocation.datasetUri.label('datasetUri'), - DatasetStorageLocation.locationUri.label('uri'), - DatasetStorageLocation.name.label('name'), - DatasetStorageLocation.owner.label('owner'), - DatasetStorageLocation.label.label('label'), - DatasetStorageLocation.description.label('description'), - DatasetStorageLocation.tags.label('tags'), - DatasetStorageLocation.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.S3BucketName.label('source'), - models.Dataset.topics.label('topics'), - models.Dataset.confidentiality.label('classification'), - DatasetStorageLocation.created, - DatasetStorageLocation.updated, - DatasetStorageLocation.deleted, - ) - .join( - models.Dataset, - models.Dataset.datasetUri == DatasetStorageLocation.datasetUri, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(DatasetStorageLocation.locationUri == locationUri) - .first() - ) - if folder: - glossary = get_target_glossary_terms(session, locationUri) - upsert( - es=es, - index='dataall-index', - id=locationUri, - doc={ - 'name': folder.name, - 'admins': folder.admins, - 'owner': folder.owner, - 'label': folder.label, - 'resourceKind': 'folder', - 'description': folder.description, - 'source': folder.source, - 'classification': folder.classification, - 'tags': [f.replace('-', '') for f in folder.tags or []], - 'topics': folder.topics, - 'region': folder.region.replace('-', ''), - 'datasetUri': folder.datasetUri, - 'environmentUri': folder.envUri, - 'environmentName': folder.envName, - 'organizationUri': folder.orgUri, - 'organizationName': folder.orgName, - 'created': folder.created, - 'updated': folder.updated, - 'deleted': folder.deleted, - 'glossary': glossary, - }, + @classmethod + def upsert(cls, session, folder_uri: str): + folder = ( + session.query( + DatasetStorageLocation.datasetUri.label('datasetUri'), + DatasetStorageLocation.locationUri.label('uri'), + DatasetStorageLocation.name.label('name'), + DatasetStorageLocation.owner.label('owner'), + DatasetStorageLocation.label.label('label'), + DatasetStorageLocation.description.label('description'), + DatasetStorageLocation.tags.label('tags'), + DatasetStorageLocation.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + 
models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.S3BucketName.label('source'), + models.Dataset.topics.label('topics'), + models.Dataset.confidentiality.label('classification'), + DatasetStorageLocation.created, + DatasetStorageLocation.updated, + DatasetStorageLocation.deleted, + ) + .join( + models.Dataset, + models.Dataset.datasetUri == DatasetStorageLocation.datasetUri, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(DatasetStorageLocation.locationUri == folder_uri) + .first() ) - upsert_dataset(session, es, folder.datasetUri) - return folder + if folder: + glossary = BaseIndexer._get_target_glossary_terms(session, folder_uri) + BaseIndexer._index( + doc_id=folder_uri, + doc={ + 'name': folder.name, + 'admins': folder.admins, + 'owner': folder.owner, + 'label': folder.label, + 'resourceKind': 'folder', + 'description': folder.description, + 'source': folder.source, + 'classification': folder.classification, + 'tags': [f.replace('-', '') for f in folder.tags or []], + 'topics': folder.topics, + 'region': folder.region.replace('-', ''), + 'datasetUri': folder.datasetUri, + 'environmentUri': folder.envUri, + 'environmentName': folder.envName, + 'organizationUri': folder.orgUri, + 'organizationName': folder.orgName, + 'created': folder.created, + 'updated': folder.updated, + 'deleted': folder.deleted, + 'glossary': glossary, + }, + ) + DatasetIndexer.upsert(session=session, dataset_uri=folder.datasetUri) + return folder -def upsert_dashboard(session, es, dashboardUri: str): - dashboard = ( - session.query( - models.Dashboard.dashboardUri.label('uri'), - models.Dashboard.name.label('name'), - models.Dashboard.owner.label('owner'), - models.Dashboard.label.label('label'), - models.Dashboard.description.label('description'), - models.Dashboard.tags.label('tags'), - models.Dashboard.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dashboard.SamlGroupName.label('admins'), - models.Dashboard.created, - models.Dashboard.updated, - models.Dashboard.deleted, - ) - .join( - models.Organization, - models.Dashboard.organizationUri == models.Dashboard.organizationUri, - ) - .join( - models.Environment, - models.Dashboard.environmentUri == models.Environment.environmentUri, - ) - .filter(models.Dashboard.dashboardUri == dashboardUri) - .first() - ) - if dashboard: - glossary = get_target_glossary_terms(session, dashboardUri) - count_upvotes = db.api.Vote.count_upvotes( - session, None, None, dashboardUri, {'targetType': 'dashboard'} - ) - upsert( - es=es, - index='dataall-index', - id=dashboardUri, - doc={ - 'name': dashboard.name, - 'admins': dashboard.admins, - 'owner': dashboard.owner, - 'label': dashboard.label, - 'resourceKind': 'dashboard', - 'description': dashboard.description, - 'tags': [f.replace('-', '') for f in dashboard.tags or []], - 'topics': [], - 'region': dashboard.region.replace('-', ''), - 'environmentUri': dashboard.envUri, - 'environmentName': dashboard.envName, - 'organizationUri': dashboard.orgUri, - 'organizationName': dashboard.orgName, - 'created': 
dashboard.created, - 'updated': dashboard.updated, - 'deleted': dashboard.deleted, - 'glossary': glossary, - 'upvotes': count_upvotes, - }, +class DashboardIndexer(BaseIndexer): + @classmethod + def upsert(cls, session, dashboard_uri: str): + dashboard = ( + session.query( + models.Dashboard.dashboardUri.label('uri'), + models.Dashboard.name.label('name'), + models.Dashboard.owner.label('owner'), + models.Dashboard.label.label('label'), + models.Dashboard.description.label('description'), + models.Dashboard.tags.label('tags'), + models.Dashboard.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dashboard.SamlGroupName.label('admins'), + models.Dashboard.created, + models.Dashboard.updated, + models.Dashboard.deleted, + ) + .join( + models.Organization, + models.Dashboard.organizationUri == models.Dashboard.organizationUri, + ) + .join( + models.Environment, + models.Dashboard.environmentUri == models.Environment.environmentUri, + ) + .filter(models.Dashboard.dashboardUri == dashboard_uri) + .first() ) - return dashboard + if dashboard: + glossary = BaseIndexer._get_target_glossary_terms(session, dashboard_uri) + count_upvotes = db.api.Vote.count_upvotes( + session, None, None, dashboard_uri, {'targetType': 'dashboard'} + ) + BaseIndexer._index( + doc_id=dashboard_uri, + doc={ + 'name': dashboard.name, + 'admins': dashboard.admins, + 'owner': dashboard.owner, + 'label': dashboard.label, + 'resourceKind': 'dashboard', + 'description': dashboard.description, + 'tags': [f.replace('-', '') for f in dashboard.tags or []], + 'topics': [], + 'region': dashboard.region.replace('-', ''), + 'environmentUri': dashboard.envUri, + 'environmentName': dashboard.envName, + 'organizationUri': dashboard.orgUri, + 'organizationName': dashboard.orgName, + 'created': dashboard.created, + 'updated': dashboard.updated, + 'deleted': dashboard.deleted, + 'glossary': glossary, + 'upvotes': count_upvotes, + }, + ) + return dashboard def upsert_dataset_tables(session, es, datasetUri: str): @@ -328,7 +309,7 @@ def upsert_dataset_tables(session, es, datasetUri: str): .all() ) for table in tables: - upsert_table(session, es, table.tableUri) + DatasetTableIndexer.upsert(session=session, table_uri=table.tableUri) return tables @@ -355,7 +336,7 @@ def upsert_dataset_folders(session, es, datasetUri: str): .all() ) for folder in folders: - upsert_folder(session, es, folder.locationUri) + DatasetLocationIndexer.upsert(session=session, folder_uri=folder.locationUri) return folders diff --git a/backend/dataall/searchproxy/upsert.py b/backend/dataall/searchproxy/upsert.py index 0fd9735e5..9eb2e3125 100644 --- a/backend/dataall/searchproxy/upsert.py +++ b/backend/dataall/searchproxy/upsert.py @@ -1,15 +1,65 @@ import logging +import os +from abc import ABC, abstractmethod from datetime import datetime +from operator import and_ + +from sqlalchemy.orm import with_expression + +from dataall.db import models +from dataall.searchproxy import connect log = logging.getLogger(__name__) -def upsert(es, index, id, doc): - doc['_indexed'] = datetime.now() - if es: - res = es.index(index=index, id=id, body=doc) - log.info(f'doc {doc} for id {id} indexed with response {res}') - return True - else: - log.error(f'ES config is missing doc {doc} for id {id} was not indexed') - return False +class BaseIndexer(ABC): + """API to work with OpenSearch""" + _INDEX = 
'dataall-index' + _es = None + + @classmethod + def es(cls): + """Lazy creation of the OpenSearch connection""" + if cls._es is None: + cls._es = connect(envname=os.getenv('envname', 'local')) + + return cls._es + + @staticmethod + @abstractmethod + def upsert(session, target_id): + raise NotImplementedError("Method upsert is not implemented") + + @classmethod + def _index(cls, doc_id, doc): + es = cls.es() + doc['_indexed'] = datetime.now() + if es: + res = es.index(index=BaseIndexer._INDEX, id=doc_id, body=doc) + log.info(f'doc {doc} for id {doc_id} indexed with response {res}') + return True + else: + log.error(f'ES config is missing doc {doc} for id {doc_id} was not indexed') + return False + + @staticmethod + def _get_target_glossary_terms(session, target_uri): + q = ( + session.query(models.TermLink) + .options( + with_expression(models.TermLink.path, models.GlossaryNode.path), + with_expression(models.TermLink.label, models.GlossaryNode.label), + with_expression(models.TermLink.readme, models.GlossaryNode.readme), + ) + .join( + models.GlossaryNode, models.GlossaryNode.nodeUri == models.TermLink.nodeUri + ) + .filter( + and_( + models.TermLink.targetUri == target_uri, + models.TermLink.approvedBySteward.is_(True), + ) + ) + ) + return [t.path for t in q] + diff --git a/backend/dataall/tasks/catalog_indexer.py b/backend/dataall/tasks/catalog_indexer.py index 2a53880c8..9c70ce9ca 100644 --- a/backend/dataall/tasks/catalog_indexer.py +++ b/backend/dataall/tasks/catalog_indexer.py @@ -5,7 +5,7 @@ from .. import db from ..db import get_engine, exceptions from ..db import models -from ..searchproxy import indexers +from dataall.searchproxy.indexers import upsert_dataset_tables, upsert_dataset_folders, DashboardIndexer from ..searchproxy.connect import ( connect, ) @@ -33,8 +33,8 @@ def index_objects(engine, es): log.info(f'Found {len(all_datasets)} datasets') dataset: models.Dataset for dataset in all_datasets: - tables = indexers.upsert_dataset_tables(session, es, dataset.datasetUri) - folders = indexers.upsert_dataset_folders( + tables = upsert_dataset_tables(session, es, dataset.datasetUri) + folders = upsert_dataset_folders( session, es, dataset.datasetUri ) indexed_objects_counter = ( @@ -45,7 +45,7 @@ def index_objects(engine, es): log.info(f'Found {len(all_dashboards)} dashboards') dashboard: models.Dashboard for dashboard in all_dashboards: - indexers.upsert_dashboard(session, es, dashboard.dashboardUri) + DashboardIndexer.upsert(session=session, dashboard_uri=dashboard.dashboardUri) indexed_objects_counter = indexed_objects_counter + 1 log.info(f'Successfully indexed {indexed_objects_counter} objects') diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 8334f7700..4a160d818 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -1,3 +1,4 @@ +import dataall.searchproxy.indexers from .client import * from dataall.db import models from dataall.api import constants @@ -29,10 +30,10 @@ def patch_es(module_mocker): module_mocker.patch('dataall.searchproxy.search', return_value={}) module_mocker.patch('dataall.searchproxy.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.upsert_dataset_tables', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.upsert_dataset', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.upsert_table', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.upsert_folder', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.upsert_dashboard', 
return_value={}) + module_mocker.patch('dataall.searchproxy.indexers.DatasetIndexer.upsert', return_value={}) + module_mocker.patch('dataall.searchproxy.indexers.DatasetTableIndexer.upsert', return_value={}) + module_mocker.patch('dataall.searchproxy.indexers.DatasetLocationIndexer.upsert', return_value={}) + module_mocker.patch('dataall.searchproxy.indexers.DashboardIndexer.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.delete_doc', return_value={}) diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index fcdb18bb0..b9c823957 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -5,6 +5,11 @@ import dataall from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.searchproxy.indexers import ( + DatasetIndexer, + DatasetTableIndexer, + DatasetLocationIndexer, +) @pytest.fixture(scope='module', autouse=True) @@ -124,8 +129,8 @@ def test_es_request(): def test_upsert_dataset(db, dataset, env, mocker): mocker.patch('dataall.searchproxy.upsert', return_value={}) with db.scoped_session() as session: - dataset_indexed = indexers.upsert_dataset( - session, es={}, datasetUri=dataset.datasetUri + dataset_indexed = DatasetIndexer.upsert( + session, dataset_uri=dataset.datasetUri ) assert dataset_indexed.datasetUri == dataset.datasetUri @@ -133,15 +138,15 @@ def test_upsert_dataset(db, dataset, env, mocker): def test_upsert_table(db, dataset, env, mocker, table): mocker.patch('dataall.searchproxy.upsert', return_value={}) with db.scoped_session() as session: - table_indexed = indexers.upsert_table(session, es={}, tableUri=table.tableUri) + table_indexed = DatasetTableIndexer.upsert(session, table_uri=table.tableUri) assert table_indexed.uri == table.tableUri def test_upsert_folder(db, dataset, env, mocker, folder): mocker.patch('dataall.searchproxy.upsert', return_value={}) with db.scoped_session() as session: - folder_indexed = indexers.upsert_folder( - session, es={}, locationUri=folder.locationUri + folder_indexed = DatasetLocationIndexer.upsert( + session=session, folder_uri=folder.locationUri ) assert folder_indexed.uri == folder.locationUri diff --git a/tests/tasks/test_catalog_indexer.py b/tests/tasks/test_catalog_indexer.py index 77090b2d4..d6e73e4c6 100644 --- a/tests/tasks/test_catalog_indexer.py +++ b/tests/tasks/test_catalog_indexer.py @@ -86,7 +86,7 @@ def test_catalog_indexer(db, org, env, sync_dataset, table, mocker): 'dataall.searchproxy.indexers.upsert_dataset_tables', return_value=[table] ) mocker.patch( - 'dataall.searchproxy.indexers.upsert_dataset', return_value=sync_dataset + 'dataall.searchproxy.indexers.DatasetIndexer.upsert', return_value=sync_dataset ) indexed_objects_counter = dataall.tasks.catalog_indexer.index_objects( engine=db, es=True From 4e31b991e7b09137a31559e1e3abc77d7820d283 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 19 Apr 2023 16:41:48 +0200 Subject: [PATCH 50/67] Extracted upsert_dataset_folders into DatasetLocationIndexer and renamed it --- backend/dataall/searchproxy/indexers.py | 22 +++++++++++----------- backend/dataall/tasks/catalog_indexer.py | 6 ++---- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 34c1c3d17..a58d832eb 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -233,6 +233,17 @@ def upsert(cls, session, 
folder_uri: str): DatasetIndexer.upsert(session=session, dataset_uri=folder.datasetUri) return folder + @classmethod + def upsert_all(cls, session, dataset_uri: str): + folders = ( + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) + .all() + ) + for folder in folders: + DatasetLocationIndexer.upsert(session=session, folder_uri=folder.locationUri) + return folders + class DashboardIndexer(BaseIndexer): @classmethod @@ -329,17 +340,6 @@ def remove_deleted_tables(session, es, datasetUri: str): return tables -def upsert_dataset_folders(session, es, datasetUri: str): - folders = ( - session.query(DatasetStorageLocation) - .filter(DatasetStorageLocation.datasetUri == datasetUri) - .all() - ) - for folder in folders: - DatasetLocationIndexer.upsert(session=session, folder_uri=folder.locationUri) - return folders - - def delete_doc(es, doc_id, index='dataall-index'): es.delete(index=index, id=doc_id, ignore=[400, 404]) return True diff --git a/backend/dataall/tasks/catalog_indexer.py b/backend/dataall/tasks/catalog_indexer.py index 9c70ce9ca..b951e7e83 100644 --- a/backend/dataall/tasks/catalog_indexer.py +++ b/backend/dataall/tasks/catalog_indexer.py @@ -5,7 +5,7 @@ from .. import db from ..db import get_engine, exceptions from ..db import models -from dataall.searchproxy.indexers import upsert_dataset_tables, upsert_dataset_folders, DashboardIndexer +from dataall.searchproxy.indexers import upsert_dataset_tables, DashboardIndexer, DatasetLocationIndexer from ..searchproxy.connect import ( connect, ) @@ -34,9 +34,7 @@ def index_objects(engine, es): dataset: models.Dataset for dataset in all_datasets: tables = upsert_dataset_tables(session, es, dataset.datasetUri) - folders = upsert_dataset_folders( - session, es, dataset.datasetUri - ) + folders = DatasetLocationIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) indexed_objects_counter = ( indexed_objects_counter + len(tables) + len(folders) + 1 ) From b7728125afdce3a3e42d75f96593b3aeae55665e Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 19 Apr 2023 16:51:39 +0200 Subject: [PATCH 51/67] Moved DatasetLocationIndexer into the dataset module --- backend/dataall/modules/datasets/__init__.py | 2 +- .../api/storage_location/resolvers.py | 3 +- .../modules/datasets/indexers/__init__.py | 1 + .../datasets/indexers/location_indexer.py | 89 +++++++++++++++++++ backend/dataall/searchproxy/indexers.py | 84 ----------------- backend/dataall/tasks/catalog_indexer.py | 3 +- tests/api/conftest.py | 5 +- tests/searchproxy/test_indexers.py | 2 +- 8 files changed, 99 insertions(+), 90 deletions(-) create mode 100644 backend/dataall/modules/datasets/indexers/__init__.py create mode 100644 backend/dataall/modules/datasets/indexers/location_indexer.py diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index a2f600f68..b976764ce 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -5,8 +5,8 @@ from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation +from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.loader import ModuleInterface, ImportMode -from dataall.searchproxy.indexers import DatasetLocationIndexer log = logging.getLogger(__name__) 
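For reference, a minimal sketch of how the pieces introduced in the last few patches fit together (all class and function names are taken from the diffs above; `session`, `es` and `location_uri` are assumed to be the caller's SQLAlchemy session, OpenSearch client and folder URI, and the snippet is illustrative rather than part of any patch):

    from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition
    from dataall.modules.datasets.db.models import DatasetStorageLocation
    from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer

    # A module registers its glossary target together with the indexer class
    # that knows how to upsert the matching OpenSearch document.
    GlossaryRegistry.register(GlossaryDefinition(
        target_type="Folder",
        object_type="DatasetStorageLocation",
        model=DatasetStorageLocation,
        reindexer=DatasetLocationIndexer,
    ))

    # Callers no longer branch on the target model: the registry dispatches to
    # DatasetLocationIndexer.upsert(session, location_uri), and BaseIndexer.es()
    # opens the OpenSearch connection lazily, so no client has to be threaded through.
    GlossaryRegistry.reindex(session, es, "Folder", location_uri)
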
diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 4aebc1458..1b6dcdb92 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -1,6 +1,5 @@ from dataall.api.context import Context from dataall.aws.handlers.service_handlers import Worker -from dataall.aws.handlers.s3 import S3 from dataall.db import permissions, models from dataall.db.api import ( ResourcePolicy, @@ -9,10 +8,10 @@ Environment, ) from dataall.modules.datasets.handlers.s3_location_handler import S3DatasetLocationHandler +from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService -from dataall.searchproxy.indexers import DatasetLocationIndexer def create_storage_location( diff --git a/backend/dataall/modules/datasets/indexers/__init__.py b/backend/dataall/modules/datasets/indexers/__init__.py new file mode 100644 index 000000000..faf66363b --- /dev/null +++ b/backend/dataall/modules/datasets/indexers/__init__.py @@ -0,0 +1 @@ +"""Contains dataset related indexers for OpenSearch""" diff --git a/backend/dataall/modules/datasets/indexers/location_indexer.py b/backend/dataall/modules/datasets/indexers/location_indexer.py new file mode 100644 index 000000000..9a3147e12 --- /dev/null +++ b/backend/dataall/modules/datasets/indexers/location_indexer.py @@ -0,0 +1,89 @@ +"""Indexes DatasetStorageLocation in OpenSearch""" +from dataall.modules.datasets.db.models import DatasetStorageLocation + +from dataall.db import models +from dataall.searchproxy.indexers import DatasetIndexer +from dataall.searchproxy.upsert import BaseIndexer + + +class DatasetLocationIndexer(BaseIndexer): + + @classmethod + def upsert(cls, session, folder_uri: str): + folder = ( + session.query( + DatasetStorageLocation.datasetUri.label('datasetUri'), + DatasetStorageLocation.locationUri.label('uri'), + DatasetStorageLocation.name.label('name'), + DatasetStorageLocation.owner.label('owner'), + DatasetStorageLocation.label.label('label'), + DatasetStorageLocation.description.label('description'), + DatasetStorageLocation.tags.label('tags'), + DatasetStorageLocation.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.S3BucketName.label('source'), + models.Dataset.topics.label('topics'), + models.Dataset.confidentiality.label('classification'), + DatasetStorageLocation.created, + DatasetStorageLocation.updated, + DatasetStorageLocation.deleted, + ) + .join( + models.Dataset, + models.Dataset.datasetUri == DatasetStorageLocation.datasetUri, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(DatasetStorageLocation.locationUri == folder_uri) + .first() + ) + if folder: + glossary = BaseIndexer._get_target_glossary_terms(session, folder_uri) + BaseIndexer._index( + doc_id=folder_uri, + doc={ + 'name': folder.name, + 'admins': 
folder.admins, + 'owner': folder.owner, + 'label': folder.label, + 'resourceKind': 'folder', + 'description': folder.description, + 'source': folder.source, + 'classification': folder.classification, + 'tags': [f.replace('-', '') for f in folder.tags or []], + 'topics': folder.topics, + 'region': folder.region.replace('-', ''), + 'datasetUri': folder.datasetUri, + 'environmentUri': folder.envUri, + 'environmentName': folder.envName, + 'organizationUri': folder.orgUri, + 'organizationName': folder.orgName, + 'created': folder.created, + 'updated': folder.updated, + 'deleted': folder.deleted, + 'glossary': glossary, + }, + ) + DatasetIndexer.upsert(session=session, dataset_uri=folder.datasetUri) + return folder + + @classmethod + def upsert_all(cls, session, dataset_uri: str): + folders = ( + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) + .all() + ) + for folder in folders: + DatasetLocationIndexer.upsert(session=session, folder_uri=folder.locationUri) + return folders diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index a58d832eb..601945509 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,7 +5,6 @@ from .. import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer -from dataall.modules.datasets.db.models import DatasetStorageLocation log = logging.getLogger(__name__) @@ -162,89 +161,6 @@ def upsert(cls, session, table_uri: str): return table -class DatasetLocationIndexer(BaseIndexer): - - @classmethod - def upsert(cls, session, folder_uri: str): - folder = ( - session.query( - DatasetStorageLocation.datasetUri.label('datasetUri'), - DatasetStorageLocation.locationUri.label('uri'), - DatasetStorageLocation.name.label('name'), - DatasetStorageLocation.owner.label('owner'), - DatasetStorageLocation.label.label('label'), - DatasetStorageLocation.description.label('description'), - DatasetStorageLocation.tags.label('tags'), - DatasetStorageLocation.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.S3BucketName.label('source'), - models.Dataset.topics.label('topics'), - models.Dataset.confidentiality.label('classification'), - DatasetStorageLocation.created, - DatasetStorageLocation.updated, - DatasetStorageLocation.deleted, - ) - .join( - models.Dataset, - models.Dataset.datasetUri == DatasetStorageLocation.datasetUri, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(DatasetStorageLocation.locationUri == folder_uri) - .first() - ) - if folder: - glossary = BaseIndexer._get_target_glossary_terms(session, folder_uri) - BaseIndexer._index( - doc_id=folder_uri, - doc={ - 'name': folder.name, - 'admins': folder.admins, - 'owner': folder.owner, - 'label': folder.label, - 'resourceKind': 'folder', - 'description': folder.description, - 'source': folder.source, - 'classification': folder.classification, - 'tags': [f.replace('-', '') for f in folder.tags or []], - 'topics': folder.topics, - 'region': folder.region.replace('-', ''), - 'datasetUri': folder.datasetUri, - 'environmentUri': folder.envUri, - 
'environmentName': folder.envName, - 'organizationUri': folder.orgUri, - 'organizationName': folder.orgName, - 'created': folder.created, - 'updated': folder.updated, - 'deleted': folder.deleted, - 'glossary': glossary, - }, - ) - DatasetIndexer.upsert(session=session, dataset_uri=folder.datasetUri) - return folder - - @classmethod - def upsert_all(cls, session, dataset_uri: str): - folders = ( - session.query(DatasetStorageLocation) - .filter(DatasetStorageLocation.datasetUri == dataset_uri) - .all() - ) - for folder in folders: - DatasetLocationIndexer.upsert(session=session, folder_uri=folder.locationUri) - return folders - - class DashboardIndexer(BaseIndexer): @classmethod def upsert(cls, session, dashboard_uri: str): diff --git a/backend/dataall/tasks/catalog_indexer.py b/backend/dataall/tasks/catalog_indexer.py index b951e7e83..e3d80458e 100644 --- a/backend/dataall/tasks/catalog_indexer.py +++ b/backend/dataall/tasks/catalog_indexer.py @@ -2,10 +2,11 @@ import os import sys +from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from .. import db from ..db import get_engine, exceptions from ..db import models -from dataall.searchproxy.indexers import upsert_dataset_tables, DashboardIndexer, DatasetLocationIndexer +from dataall.searchproxy.indexers import upsert_dataset_tables, DashboardIndexer from ..searchproxy.connect import ( connect, ) diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 4a160d818..f61ad46ef 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -32,7 +32,10 @@ def patch_es(module_mocker): module_mocker.patch('dataall.searchproxy.indexers.upsert_dataset_tables', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.DatasetIndexer.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.DatasetTableIndexer.upsert', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.DatasetLocationIndexer.upsert', return_value={}) + module_mocker.patch( + 'dataall.modules.datasets.indexers.location_indexer.DatasetLocationIndexer.upsert', + return_value={} + ) module_mocker.patch('dataall.searchproxy.indexers.DashboardIndexer.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.delete_doc', return_value={}) diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index b9c823957..eda5a7dd7 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -3,12 +3,12 @@ import pytest import dataall +from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation from dataall.searchproxy.indexers import ( DatasetIndexer, DatasetTableIndexer, - DatasetLocationIndexer, ) From cd798e2c5537f546a3f8a7fcac3f4e07358ddb4c Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 10:26:46 +0200 Subject: [PATCH 52/67] Moved DatasetStorageLocation methods to the service --- .../dataall/api/Objects/Dataset/resolvers.py | 7 ++- backend/dataall/db/api/dataset.py | 60 +------------------ .../datasets/services/dataset_location.py | 57 ++++++++++++++++++ backend/dataall/searchproxy/indexers.py | 3 +- 4 files changed, 65 insertions(+), 62 deletions(-) diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index fcbc2d6f3..63b9a47b4 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ 
b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -18,6 +18,7 @@ from ....db.api.organization import Organization from dataall.searchproxy import indexers from dataall.searchproxy.indexers import DatasetIndexer +from ....modules.datasets.services.dataset_location import DatasetStorageLocationService log = logging.getLogger(__name__) @@ -160,7 +161,7 @@ def list_locations(context, source: models.Dataset, filter: dict = None): if not filter: filter = {'page': 1, 'pageSize': 5} with context.engine.scoped_session() as session: - return Dataset.paginated_dataset_locations( + return DatasetStorageLocationService.paginated_dataset_locations( session=session, username=context.username, groups=context.groups, @@ -232,7 +233,7 @@ def get_dataset_statistics(context: Context, source: models.Dataset, **kwargs): return None with context.engine.scoped_session() as session: count_tables = db.api.Dataset.count_dataset_tables(session, source.datasetUri) - count_locations = db.api.Dataset.count_dataset_locations( + count_locations = DatasetStorageLocationService.count_dataset_locations( session, source.datasetUri ) count_upvotes = db.api.Vote.count_upvotes( @@ -557,7 +558,7 @@ def delete_dataset( for uri in tables: indexers.delete_doc(es=context.es, doc_id=uri) - folders = [f.locationUri for f in Dataset.get_dataset_folders(session, datasetUri)] + folders = [f.locationUri for f in DatasetStorageLocationService.get_dataset_folders(session, datasetUri)] for uri in folders: indexers.delete_doc(es=context.es, doc_id=uri) diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py index f78d92eae..27b5f59d0 100644 --- a/backend/dataall/db/api/dataset.py +++ b/backend/dataall/db/api/dataset.py @@ -16,11 +16,11 @@ from . import Organization from .. 
import models, api, exceptions, permissions, paginate from ..models.Enums import Language, ConfidentialityClassification +from ...modules.datasets.services.dataset_location import DatasetStorageLocationService from ...utils.naming_convention import ( NamingConventionService, NamingConventionPattern, ) -from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -263,30 +263,6 @@ def paginated_user_datasets( page_size=data.get('pageSize', 10), ).to_dict() - @staticmethod - def paginated_dataset_locations( - session, username, groups, uri, data=None, check_perm=None - ) -> dict: - query = session.query(DatasetStorageLocation).filter( - DatasetStorageLocation.datasetUri == uri - ) - if data and data.get('term'): - query = query.filter( - or_( - *[ - DatasetStorageLocation.name.ilike( - '%' + data.get('term') + '%' - ), - DatasetStorageLocation.S3Prefix.ilike( - '%' + data.get('term') + '%' - ), - ] - ) - ) - return paginate( - query=query, page_size=data.get('pageSize', 10), page=data.get('page', 1) - ).to_dict() - @staticmethod def paginated_dataset_tables( session, username, groups, uri, data=None, check_perm=None @@ -486,15 +462,6 @@ def get_dataset_tables(session, dataset_uri): .all() ) - @staticmethod - def get_dataset_folders(session, dataset_uri): - """return the dataset folders""" - return ( - session.query(DatasetStorageLocation) - .filter(DatasetStorageLocation.datasetUri == dataset_uri) - .all() - ) - @staticmethod def query_dataset_shares(session, dataset_uri) -> Query: return session.query(models.ShareObject).filter( @@ -549,7 +516,7 @@ def delete_dataset( Dataset._delete_dataset_shares_with_no_shared_items(session, uri) Dataset._delete_dataset_term_links(session, uri) Dataset._delete_dataset_tables(session, dataset.datasetUri) - Dataset._delete_dataset_locations(session, dataset.datasetUri) + DatasetStorageLocationService.delete_dataset_locations(session, dataset.datasetUri) KeyValueTag.delete_key_value_tags(session, dataset.datasetUri, 'dataset') Vote.delete_votes(session, dataset.datasetUri, 'dataset') session.delete(dataset) @@ -632,21 +599,6 @@ def _delete_dataset_tables(session, dataset_uri) -> bool: table.deleted = datetime.now() return tables - @staticmethod - def _delete_dataset_locations(session, dataset_uri) -> bool: - locations = ( - session.query(DatasetStorageLocation) - .filter( - and_( - DatasetStorageLocation.datasetUri == dataset_uri, - ) - ) - .all() - ) - for location in locations: - session.delete(location) - return True - @staticmethod def list_all_datasets(session) -> [models.Dataset]: return session.query(models.Dataset).all() @@ -672,11 +624,3 @@ def count_dataset_tables(session, dataset_uri): .filter(models.DatasetTable.datasetUri == dataset_uri) .count() ) - - @staticmethod - def count_dataset_locations(session, dataset_uri): - return ( - session.query(DatasetStorageLocation) - .filter(DatasetStorageLocation.datasetUri == dataset_uri) - .count() - ) diff --git a/backend/dataall/modules/datasets/services/dataset_location.py b/backend/dataall/modules/datasets/services/dataset_location.py index 640f0a037..4d82ec2d3 100644 --- a/backend/dataall/modules/datasets/services/dataset_location.py +++ b/backend/dataall/modules/datasets/services/dataset_location.py @@ -203,3 +203,60 @@ def get_location_by_s3_prefix(session, s3_prefix, accountid, region): else: logging.info(f'Found location {location.locationUri}|{location.S3Prefix}') return location + + @staticmethod + def count_dataset_locations(session, 
dataset_uri): + return ( + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) + .count() + ) + + @staticmethod + def delete_dataset_locations(session, dataset_uri) -> bool: + locations = ( + session.query(DatasetStorageLocation) + .filter( + and_( + DatasetStorageLocation.datasetUri == dataset_uri, + ) + ) + .all() + ) + for location in locations: + session.delete(location) + return True + + @staticmethod + def get_dataset_folders(session, dataset_uri): + """return the dataset folders""" + return ( + session.query(DatasetStorageLocation) + .filter(DatasetStorageLocation.datasetUri == dataset_uri) + .all() + ) + + @staticmethod + def paginated_dataset_locations( + session, username, groups, uri, data=None, check_perm=None + ) -> dict: + query = session.query(DatasetStorageLocation).filter( + DatasetStorageLocation.datasetUri == uri + ) + if data and data.get('term'): + query = query.filter( + or_( + *[ + DatasetStorageLocation.name.ilike( + '%' + data.get('term') + '%' + ), + DatasetStorageLocation.S3Prefix.ilike( + '%' + data.get('term') + '%' + ), + ] + ) + ) + return paginate( + query=query, page_size=data.get('pageSize', 10), page=data.get('page', 1) + ).to_dict() + diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 601945509..f157deae2 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,6 +5,7 @@ from .. import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer +from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService log = logging.getLogger(__name__) @@ -47,7 +48,7 @@ def upsert(cls, session, dataset_uri: str): .first() ) count_tables = db.api.Dataset.count_dataset_tables(session, dataset_uri) - count_folders = db.api.Dataset.count_dataset_locations(session, dataset_uri) + count_folders = DatasetStorageLocationService.count_dataset_locations(session, dataset_uri) count_upvotes = db.api.Vote.count_upvotes( session, None, None, dataset_uri, {'targetType': 'dataset'} ) From b0e6a62a9449edfd908066bdfe5fb17b2e4ffbb2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 10:27:14 +0200 Subject: [PATCH 53/67] Renamed the service --- .../dataall/api/Objects/Dataset/resolvers.py | 8 ++++---- backend/dataall/db/api/dataset.py | 4 ++-- .../api/storage_location/resolvers.py | 20 +++++++++---------- .../datasets/handlers/s3_location_handler.py | 4 ++-- .../datasets/services/dataset_location.py | 8 ++++---- .../datasets/tasks/subscription_service.py | 4 ++-- backend/dataall/searchproxy/indexers.py | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index 63b9a47b4..1b522cd94 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -18,7 +18,7 @@ from ....db.api.organization import Organization from dataall.searchproxy import indexers from dataall.searchproxy.indexers import DatasetIndexer -from ....modules.datasets.services.dataset_location import DatasetStorageLocationService +from ....modules.datasets.services.dataset_location import DatasetLocationService log = logging.getLogger(__name__) @@ -161,7 +161,7 @@ def list_locations(context, source: models.Dataset, filter: dict = None): if not filter: filter = {'page': 1, 'pageSize': 5} with context.engine.scoped_session() as session: - return 
DatasetStorageLocationService.paginated_dataset_locations( + return DatasetLocationService.paginated_dataset_locations( session=session, username=context.username, groups=context.groups, @@ -233,7 +233,7 @@ def get_dataset_statistics(context: Context, source: models.Dataset, **kwargs): return None with context.engine.scoped_session() as session: count_tables = db.api.Dataset.count_dataset_tables(session, source.datasetUri) - count_locations = DatasetStorageLocationService.count_dataset_locations( + count_locations = DatasetLocationService.count_dataset_locations( session, source.datasetUri ) count_upvotes = db.api.Vote.count_upvotes( @@ -558,7 +558,7 @@ def delete_dataset( for uri in tables: indexers.delete_doc(es=context.es, doc_id=uri) - folders = [f.locationUri for f in DatasetStorageLocationService.get_dataset_folders(session, datasetUri)] + folders = [f.locationUri for f in DatasetLocationService.get_dataset_folders(session, datasetUri)] for uri in folders: indexers.delete_doc(es=context.es, doc_id=uri) diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py index 27b5f59d0..c9f2bd581 100644 --- a/backend/dataall/db/api/dataset.py +++ b/backend/dataall/db/api/dataset.py @@ -16,7 +16,7 @@ from . import Organization from .. import models, api, exceptions, permissions, paginate from ..models.Enums import Language, ConfidentialityClassification -from ...modules.datasets.services.dataset_location import DatasetStorageLocationService +from ...modules.datasets.services.dataset_location import DatasetLocationService from ...utils.naming_convention import ( NamingConventionService, NamingConventionPattern, @@ -516,7 +516,7 @@ def delete_dataset( Dataset._delete_dataset_shares_with_no_shared_items(session, uri) Dataset._delete_dataset_term_links(session, uri) Dataset._delete_dataset_tables(session, dataset.datasetUri) - DatasetStorageLocationService.delete_dataset_locations(session, dataset.datasetUri) + DatasetLocationService.delete_dataset_locations(session, dataset.datasetUri) KeyValueTag.delete_key_value_tags(session, dataset.datasetUri, 'dataset') Vote.delete_votes(session, dataset.datasetUri, 'dataset') session.delete(dataset) diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 1b6dcdb92..09cf4b14a 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -11,14 +11,14 @@ from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation -from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService +from dataall.modules.datasets.services.dataset_location import DatasetLocationService def create_storage_location( context, source, datasetUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - location = DatasetStorageLocationService.create_dataset_location( + location = DatasetLocationService.create_dataset_location( session=session, username=context.username, groups=context.groups, @@ -39,15 +39,15 @@ def list_dataset_locations(context, source, filter: dict = None): if not filter: filter = {} with context.engine.scoped_session() as session: - return DatasetStorageLocationService.list_dataset_locations( + return DatasetLocationService.list_dataset_locations( session=session, 
uri=source.datasetUri, data=filter, check_perm=True ) def get_storage_location(context, source, locationUri=None): with context.engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) - return DatasetStorageLocationService.get_dataset_location( + location = DatasetLocationService.get_location_by_uri(session, locationUri) + return DatasetLocationService.get_dataset_location( session=session, username=context.username, groups=context.groups, @@ -61,10 +61,10 @@ def update_storage_location( context, source, locationUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) + location = DatasetLocationService.get_location_by_uri(session, locationUri) input['location'] = location input['locationUri'] = location.locationUri - DatasetStorageLocationService.update_dataset_location( + DatasetLocationService.update_dataset_location( session=session, username=context.username, groups=context.groups, @@ -79,8 +79,8 @@ def update_storage_location( def remove_storage_location(context, source, locationUri: str = None): with context.engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) - DatasetStorageLocationService.delete_dataset_location( + location = DatasetLocationService.get_location_by_uri(session, locationUri) + DatasetLocationService.delete_dataset_location( session=session, username=context.username, groups=context.groups, @@ -102,7 +102,7 @@ def resolve_dataset(context, source: DatasetStorageLocation, **kwargs): def publish_location_update(context: Context, source, locationUri: str = None): with context.engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri(session, locationUri) + location = DatasetLocationService.get_location_by_uri(session, locationUri) ResourcePolicy.check_user_resource_permission( session=session, username=context.username, diff --git a/backend/dataall/modules/datasets/handlers/s3_location_handler.py b/backend/dataall/modules/datasets/handlers/s3_location_handler.py index 431a4cecd..ba8cf6eda 100644 --- a/backend/dataall/modules/datasets/handlers/s3_location_handler.py +++ b/backend/dataall/modules/datasets/handlers/s3_location_handler.py @@ -3,7 +3,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.aws.handlers.sts import SessionHelper from dataall.db import models -from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService +from dataall.modules.datasets.services.dataset_location import DatasetLocationService log = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def client(account_id: str, region: str, client_type: str): @Worker.handler(path='s3.prefix.create') def create_dataset_location(engine, task: models.Task): with engine.scoped_session() as session: - location = DatasetStorageLocationService.get_location_by_uri( + location = DatasetLocationService.get_location_by_uri( session, task.targetUri ) S3DatasetLocationHandler.create_bucket_prefix(location) diff --git a/backend/dataall/modules/datasets/services/dataset_location.py b/backend/dataall/modules/datasets/services/dataset_location.py index 4d82ec2d3..d0e8f0936 100644 --- a/backend/dataall/modules/datasets/services/dataset_location.py +++ b/backend/dataall/modules/datasets/services/dataset_location.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class 
DatasetStorageLocationService: +class DatasetLocationService: @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @has_resource_perm(permissions.CREATE_DATASET_FOLDER) @@ -102,7 +102,7 @@ def get_dataset_location( data: dict = None, check_perm: bool = False, ) -> DatasetStorageLocation: - return DatasetStorageLocationService.get_location_by_uri(session, data['locationUri']) + return DatasetLocationService.get_location_by_uri(session, data['locationUri']) @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @@ -118,7 +118,7 @@ def update_dataset_location( location = data.get( 'location', - DatasetStorageLocationService.get_location_by_uri(session, data['locationUri']), + DatasetLocationService.get_location_by_uri(session, data['locationUri']), ) for k in data.keys(): @@ -145,7 +145,7 @@ def delete_dataset_location( data: dict = None, check_perm: bool = False, ): - location = DatasetStorageLocationService.get_location_by_uri( + location = DatasetLocationService.get_location_by_uri( session, data['locationUri'] ) share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() diff --git a/backend/dataall/modules/datasets/tasks/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py index 94339d0f7..901865812 100644 --- a/backend/dataall/modules/datasets/tasks/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -16,7 +16,7 @@ from dataall.tasks.subscriptions import poll_queues from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService +from dataall.modules.datasets.services.dataset_location import DatasetLocationService from dataall.modules.datasets.db.models import DatasetStorageLocation root = logging.getLogger() @@ -106,7 +106,7 @@ def publish_table_update_message(engine, message): @staticmethod def publish_location_update_message(session, message): location: DatasetStorageLocation = ( - DatasetStorageLocationService.get_location_by_s3_prefix( + DatasetLocationService.get_location_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index f157deae2..12d1da90d 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,7 +5,7 @@ from .. 
import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer -from dataall.modules.datasets.services.dataset_location import DatasetStorageLocationService +from dataall.modules.datasets.services.dataset_location import DatasetLocationService log = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def upsert(cls, session, dataset_uri: str): .first() ) count_tables = db.api.Dataset.count_dataset_tables(session, dataset_uri) - count_folders = DatasetStorageLocationService.count_dataset_locations(session, dataset_uri) + count_folders = DatasetLocationService.count_dataset_locations(session, dataset_uri) count_upvotes = db.api.Vote.count_upvotes( session, None, None, dataset_uri, {'targetType': 'dataset'} ) From 27c6d79559460090732a3f27f756f87d58a9e177 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 10:39:04 +0200 Subject: [PATCH 54/67] Moved DatasetIndexer to modules --- .../dataall/api/Objects/Dataset/resolvers.py | 4 +- .../dataall/api/Objects/Glossary/registry.py | 10 +-- backend/dataall/api/Objects/Vote/resolvers.py | 3 +- backend/dataall/modules/datasets/__init__.py | 9 ++ .../datasets/indexers/dataset_indexer.py | 82 +++++++++++++++++++ .../datasets/indexers/location_indexer.py | 2 +- backend/dataall/searchproxy/indexers.py | 78 +----------------- tests/api/conftest.py | 2 +- .../modules/notebooks/test_notebook_stack.py | 2 +- .../notebooks/test_sagemaker_notebook.py | 1 - tests/searchproxy/test_indexers.py | 6 +- tests/tasks/test_catalog_indexer.py | 2 +- 12 files changed, 104 insertions(+), 97 deletions(-) create mode 100644 backend/dataall/modules/datasets/indexers/dataset_indexer.py diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index 1b522cd94..2acdfa1fc 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -17,8 +17,8 @@ from ....db.api import Dataset, Environment, ShareObject, ResourcePolicy from ....db.api.organization import Organization from dataall.searchproxy import indexers -from dataall.searchproxy.indexers import DatasetIndexer -from ....modules.datasets.services.dataset_location import DatasetLocationService +from dataall.modules.datasets.services.dataset_location import DatasetLocationService +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer log = logging.getLogger(__name__) diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index cb82bf208..147f97a0e 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -6,7 +6,7 @@ from dataall.api import gql from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models -from dataall.searchproxy.indexers import DashboardIndexer, DatasetTableIndexer, DatasetIndexer +from dataall.searchproxy.indexers import DashboardIndexer, DatasetTableIndexer from dataall.searchproxy.upsert import BaseIndexer @@ -74,10 +74,4 @@ def reindex(cls, session, es: OpenSearch, target_type: str, target_uri: str): object_type="DatasetTable", model=models.DatasetTable, reindexer=DatasetTableIndexer -)) -GlossaryRegistry.register(GlossaryDefinition( - target_type="Dataset", - object_type="Dataset", - model=models.Dataset, - reindexer=DatasetIndexer -)) +)) \ No newline at end of file diff --git a/backend/dataall/api/Objects/Vote/resolvers.py b/backend/dataall/api/Objects/Vote/resolvers.py index 
34dcd9f05..42f5c20f5 100644 --- a/backend/dataall/api/Objects/Vote/resolvers.py +++ b/backend/dataall/api/Objects/Vote/resolvers.py @@ -1,6 +1,7 @@ from .... import db from ....api.context import Context -from dataall.searchproxy.indexers import DatasetIndexer, DashboardIndexer +from dataall.searchproxy.indexers import DashboardIndexer +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer def count_upvotes( diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index b976764ce..03cd58cdc 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -4,7 +4,9 @@ from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition +from dataall.db import models from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.loader import ModuleInterface, ImportMode @@ -32,6 +34,13 @@ def __init__(self): reindexer=DatasetLocationIndexer )) + GlossaryRegistry.register(GlossaryDefinition( + target_type="Dataset", + object_type="Dataset", + model=models.Dataset, + reindexer=DatasetIndexer + )) + log.info("API of datasets has been imported") diff --git a/backend/dataall/modules/datasets/indexers/dataset_indexer.py b/backend/dataall/modules/datasets/indexers/dataset_indexer.py new file mode 100644 index 000000000..8cb0b7873 --- /dev/null +++ b/backend/dataall/modules/datasets/indexers/dataset_indexer.py @@ -0,0 +1,82 @@ +"""Indexes Datasets in OpenSearch""" + +from dataall import db +from dataall.db import models +from dataall.modules.datasets.services.dataset_location import DatasetLocationService +from dataall.searchproxy.upsert import BaseIndexer + + +class DatasetIndexer(BaseIndexer): + + @classmethod + def upsert(cls, session, dataset_uri: str): + dataset = ( + session.query( + models.Dataset.datasetUri.label('datasetUri'), + models.Dataset.name.label('name'), + models.Dataset.owner.label('owner'), + models.Dataset.label.label('label'), + models.Dataset.description.label('description'), + models.Dataset.confidentiality.label('classification'), + models.Dataset.tags.label('tags'), + models.Dataset.topics.label('topics'), + models.Dataset.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.GlueDatabaseName.label('database'), + models.Dataset.S3BucketName.label('source'), + models.Dataset.created, + models.Dataset.updated, + models.Dataset.deleted, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(models.Dataset.datasetUri == dataset_uri) + .first() + ) + count_tables = db.api.Dataset.count_dataset_tables(session, dataset_uri) + count_folders = DatasetLocationService.count_dataset_locations(session, dataset_uri) + count_upvotes = db.api.Vote.count_upvotes( + session, None, None, dataset_uri, {'targetType': 'dataset'} + ) + + if dataset: + glossary = 
BaseIndexer._get_target_glossary_terms(session, dataset_uri) + BaseIndexer._index( + doc_id=dataset_uri, + doc={ + 'name': dataset.name, + 'owner': dataset.owner, + 'label': dataset.label, + 'admins': dataset.admins, + 'database': dataset.database, + 'source': dataset.source, + 'resourceKind': 'dataset', + 'description': dataset.description, + 'classification': dataset.classification, + 'tags': [t.replace('-', '') for t in dataset.tags or []], + 'topics': dataset.topics, + 'region': dataset.region.replace('-', ''), + 'environmentUri': dataset.envUri, + 'environmentName': dataset.envName, + 'organizationUri': dataset.orgUri, + 'organizationName': dataset.orgName, + 'created': dataset.created, + 'updated': dataset.updated, + 'deleted': dataset.deleted, + 'glossary': glossary, + 'tables': count_tables, + 'folders': count_folders, + 'upvotes': count_upvotes, + }, + ) + return dataset diff --git a/backend/dataall/modules/datasets/indexers/location_indexer.py b/backend/dataall/modules/datasets/indexers/location_indexer.py index 9a3147e12..72495b51c 100644 --- a/backend/dataall/modules/datasets/indexers/location_indexer.py +++ b/backend/dataall/modules/datasets/indexers/location_indexer.py @@ -2,7 +2,7 @@ from dataall.modules.datasets.db.models import DatasetStorageLocation from dataall.db import models -from dataall.searchproxy.indexers import DatasetIndexer +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.searchproxy.upsert import BaseIndexer diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 12d1da90d..fa91cb4eb 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,87 +5,11 @@ from .. import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer -from dataall.modules.datasets.services.dataset_location import DatasetLocationService +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer log = logging.getLogger(__name__) -class DatasetIndexer(BaseIndexer): - - @classmethod - def upsert(cls, session, dataset_uri: str): - dataset = ( - session.query( - models.Dataset.datasetUri.label('datasetUri'), - models.Dataset.name.label('name'), - models.Dataset.owner.label('owner'), - models.Dataset.label.label('label'), - models.Dataset.description.label('description'), - models.Dataset.confidentiality.label('classification'), - models.Dataset.tags.label('tags'), - models.Dataset.topics.label('topics'), - models.Dataset.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.GlueDatabaseName.label('database'), - models.Dataset.S3BucketName.label('source'), - models.Dataset.created, - models.Dataset.updated, - models.Dataset.deleted, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(models.Dataset.datasetUri == dataset_uri) - .first() - ) - count_tables = db.api.Dataset.count_dataset_tables(session, dataset_uri) - count_folders = DatasetLocationService.count_dataset_locations(session, dataset_uri) - count_upvotes = db.api.Vote.count_upvotes( - session, None, None, dataset_uri, {'targetType': 'dataset'} - ) - - if 
dataset: - glossary = BaseIndexer._get_target_glossary_terms(session, dataset_uri) - BaseIndexer._index( - doc_id=dataset_uri, - doc={ - 'name': dataset.name, - 'owner': dataset.owner, - 'label': dataset.label, - 'admins': dataset.admins, - 'database': dataset.database, - 'source': dataset.source, - 'resourceKind': 'dataset', - 'description': dataset.description, - 'classification': dataset.classification, - 'tags': [t.replace('-', '') for t in dataset.tags or []], - 'topics': dataset.topics, - 'region': dataset.region.replace('-', ''), - 'environmentUri': dataset.envUri, - 'environmentName': dataset.envName, - 'organizationUri': dataset.orgUri, - 'organizationName': dataset.orgName, - 'created': dataset.created, - 'updated': dataset.updated, - 'deleted': dataset.deleted, - 'glossary': glossary, - 'tables': count_tables, - 'folders': count_folders, - 'upvotes': count_upvotes, - }, - ) - return dataset - - class DatasetTableIndexer(BaseIndexer): @classmethod diff --git a/tests/api/conftest.py b/tests/api/conftest.py index f61ad46ef..9eb50e050 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -30,7 +30,7 @@ def patch_es(module_mocker): module_mocker.patch('dataall.searchproxy.search', return_value={}) module_mocker.patch('dataall.searchproxy.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.upsert_dataset_tables', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.DatasetIndexer.upsert', return_value={}) + module_mocker.patch('dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.DatasetTableIndexer.upsert', return_value={}) module_mocker.patch( 'dataall.modules.datasets.indexers.location_indexer.DatasetLocationIndexer.upsert', diff --git a/tests/modules/notebooks/test_notebook_stack.py b/tests/modules/notebooks/test_notebook_stack.py index fc65e9af4..1c41c46c6 100644 --- a/tests/modules/notebooks/test_notebook_stack.py +++ b/tests/modules/notebooks/test_notebook_stack.py @@ -15,4 +15,4 @@ def test_notebook_stack(client, sgm_notebook, group): username="alice", groups=[group.name], ) - assert response.data.updateStack.targetUri == sgm_notebook.notebookUri \ No newline at end of file + assert response.data.updateStack.targetUri == sgm_notebook.notebookUri diff --git a/tests/modules/notebooks/test_sagemaker_notebook.py b/tests/modules/notebooks/test_sagemaker_notebook.py index 5fd4e4d16..8b2aa9792 100644 --- a/tests/modules/notebooks/test_sagemaker_notebook.py +++ b/tests/modules/notebooks/test_sagemaker_notebook.py @@ -12,7 +12,6 @@ def get_notebook_instance_status(self): return "INSERVICE" - @pytest.fixture(scope='module') def org1(org, user, group, tenant): org1 = org('testorg', user.userName, group.name) diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index eda5a7dd7..ffc0370c1 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -6,10 +6,8 @@ from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation -from dataall.searchproxy.indexers import ( - DatasetIndexer, - DatasetTableIndexer, -) +from dataall.searchproxy.indexers import DatasetTableIndexer +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer @pytest.fixture(scope='module', autouse=True) diff --git a/tests/tasks/test_catalog_indexer.py 
b/tests/tasks/test_catalog_indexer.py index d6e73e4c6..f8901933b 100644 --- a/tests/tasks/test_catalog_indexer.py +++ b/tests/tasks/test_catalog_indexer.py @@ -86,7 +86,7 @@ def test_catalog_indexer(db, org, env, sync_dataset, table, mocker): 'dataall.searchproxy.indexers.upsert_dataset_tables', return_value=[table] ) mocker.patch( - 'dataall.searchproxy.indexers.DatasetIndexer.upsert', return_value=sync_dataset + 'dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value=sync_dataset ) indexed_objects_counter = dataall.tasks.catalog_indexer.index_objects( engine=db, es=True From 0e730ac3945b53f4ee3cedaf2cd819d34e8d80b6 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 10:50:08 +0200 Subject: [PATCH 55/67] Created a dataset repository. There is a few of circular imports. It's a first attempt to solve it --- backend/dataall/db/api/dataset.py | 6 ++---- .../modules/datasets/db/dataset_repository.py | 13 +++++++++++++ .../modules/datasets/services/dataset_location.py | 4 ++-- 3 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 backend/dataall/modules/datasets/db/dataset_repository.py diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py index c9f2bd581..3c2c8de10 100644 --- a/backend/dataall/db/api/dataset.py +++ b/backend/dataall/db/api/dataset.py @@ -16,6 +16,7 @@ from . import Organization from .. import models, api, exceptions, permissions, paginate from ..models.Enums import Language, ConfidentialityClassification +from ...modules.datasets.db.dataset_repository import DatasetRepository from ...modules.datasets.services.dataset_location import DatasetLocationService from ...utils.naming_convention import ( NamingConventionService, @@ -210,10 +211,7 @@ def get_dataset( @staticmethod def get_dataset_by_uri(session, dataset_uri) -> models.Dataset: - dataset: Dataset = session.query(models.Dataset).get(dataset_uri) - if not dataset: - raise exceptions.ObjectNotFound('Dataset', dataset_uri) - return dataset + return DatasetRepository.get_dataset_by_uri(session, dataset_uri) @staticmethod def query_user_datasets(session, username, groups, filter) -> Query: diff --git a/backend/dataall/modules/datasets/db/dataset_repository.py b/backend/dataall/modules/datasets/db/dataset_repository.py new file mode 100644 index 000000000..d58c3a7a1 --- /dev/null +++ b/backend/dataall/modules/datasets/db/dataset_repository.py @@ -0,0 +1,13 @@ +from dataall.db import exceptions +from dataall.db.models import Dataset + + +class DatasetRepository: + """DAO layer for Datasets""" + + @staticmethod + def get_dataset_by_uri(session, dataset_uri) -> Dataset: + dataset: Dataset = session.query(Dataset).get(dataset_uri) + if not dataset: + raise exceptions.ObjectNotFound('Dataset', dataset_uri) + return dataset diff --git a/backend/dataall/modules/datasets/services/dataset_location.py b/backend/dataall/modules/datasets/services/dataset_location.py index d0e8f0936..f1d8b5eaf 100644 --- a/backend/dataall/modules/datasets/services/dataset_location.py +++ b/backend/dataall/modules/datasets/services/dataset_location.py @@ -5,7 +5,7 @@ from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary from dataall.db import models, api, paginate, permissions, exceptions -from dataall.db.api.dataset import Dataset +from dataall.modules.datasets.db.dataset_repository import DatasetRepository from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -23,7 +23,7 @@ def 
create_dataset_location( data: dict = None, check_perm: bool = False, ) -> DatasetStorageLocation: - dataset = Dataset.get_dataset_by_uri(session, uri) + dataset = DatasetRepository.get_dataset_by_uri(session, uri) exists = ( session.query(DatasetStorageLocation) .filter( From 9ac79644614a6f9bf2fb5b9d003abc07dc07569f Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 11:04:38 +0200 Subject: [PATCH 56/67] Moved DatasetTableIndexer --- .../dataall/api/Objects/Dataset/resolvers.py | 5 +- .../api/Objects/DatasetTable/resolvers.py | 2 +- .../datasets/indexers/table_indexer.py | 98 +++++++++++++++++++ .../modules/datasets/tasks/tables_syncer.py | 3 +- backend/dataall/searchproxy/__init__.py | 2 - backend/dataall/searchproxy/indexers.py | 93 ------------------ backend/dataall/tasks/catalog_indexer.py | 5 +- tests/api/conftest.py | 5 +- tests/searchproxy/test_indexers.py | 6 +- tests/tasks/test_catalog_indexer.py | 3 +- 10 files changed, 116 insertions(+), 106 deletions(-) create mode 100644 backend/dataall/modules/datasets/indexers/table_indexer.py diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index 2acdfa1fc..79e306c9e 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -19,6 +19,7 @@ from dataall.searchproxy import indexers from dataall.modules.datasets.services.dataset_location import DatasetLocationService from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer log = logging.getLogger(__name__) @@ -324,8 +325,8 @@ def sync_tables(context: Context, source, datasetUri: str = None): session.add(task) Worker.process(engine=context.engine, task_ids=[task.taskUri], save_response=False) with context.engine.scoped_session() as session: - indexers.upsert_dataset_tables( - session=session, es=context.es, datasetUri=dataset.datasetUri + DatasetTableIndexer.upsert_all( + session=session, dataset_uri=dataset.datasetUri ) indexers.remove_deleted_tables( session=session, es=context.es, datasetUri=dataset.datasetUri diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 567985348..7df3d8cba 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,7 +13,7 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils -from dataall.searchproxy.indexers import DatasetTableIndexer +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) diff --git a/backend/dataall/modules/datasets/indexers/table_indexer.py b/backend/dataall/modules/datasets/indexers/table_indexer.py new file mode 100644 index 000000000..1eab70a87 --- /dev/null +++ b/backend/dataall/modules/datasets/indexers/table_indexer.py @@ -0,0 +1,98 @@ +"""Indexes DatasetTable in OpenSearch""" +from operator import and_ + +from dataall.db import models +from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer +from dataall.searchproxy.upsert import BaseIndexer + + +class DatasetTableIndexer(BaseIndexer): + + @classmethod + def upsert(cls, session, table_uri: str): + table = ( + session.query( + 
models.DatasetTable.datasetUri.label('datasetUri'), + models.DatasetTable.tableUri.label('uri'), + models.DatasetTable.name.label('name'), + models.DatasetTable.owner.label('owner'), + models.DatasetTable.label.label('label'), + models.DatasetTable.description.label('description'), + models.Dataset.confidentiality.label('classification'), + models.DatasetTable.tags.label('tags'), + models.Dataset.topics.label('topics'), + models.Dataset.region.label('region'), + models.Organization.organizationUri.label('orgUri'), + models.Organization.name.label('orgName'), + models.Environment.environmentUri.label('envUri'), + models.Environment.name.label('envName'), + models.Dataset.SamlAdminGroupName.label('admins'), + models.Dataset.GlueDatabaseName.label('database'), + models.Dataset.S3BucketName.label('source'), + models.DatasetTable.created, + models.DatasetTable.updated, + models.DatasetTable.deleted, + ) + .join( + models.Dataset, + models.Dataset.datasetUri == models.DatasetTable.datasetUri, + ) + .join( + models.Organization, + models.Dataset.organizationUri == models.Organization.organizationUri, + ) + .join( + models.Environment, + models.Dataset.environmentUri == models.Environment.environmentUri, + ) + .filter(models.DatasetTable.tableUri == table_uri) + .first() + ) + + if table: + glossary = BaseIndexer._get_target_glossary_terms(session, table_uri) + tags = table.tags if table.tags else [] + BaseIndexer._index( + doc_id=table_uri, + doc={ + 'name': table.name, + 'admins': table.admins, + 'owner': table.owner, + 'label': table.label, + 'resourceKind': 'table', + 'description': table.description, + 'database': table.database, + 'source': table.source, + 'classification': table.classification, + 'tags': [t.replace('-', '') for t in tags or []], + 'topics': table.topics, + 'region': table.region.replace('-', ''), + 'datasetUri': table.datasetUri, + 'environmentUri': table.envUri, + 'environmentName': table.envName, + 'organizationUri': table.orgUri, + 'organizationName': table.orgName, + 'created': table.created, + 'updated': table.updated, + 'deleted': table.deleted, + 'glossary': glossary, + }, + ) + DatasetIndexer.upsert(session=session, dataset_uri=table.datasetUri) + return table + + @classmethod + def upsert_all(cls, session, dataset_uri: str): + tables = ( + session.query(models.DatasetTable) + .filter( + and_( + models.DatasetTable.datasetUri == dataset_uri, + models.DatasetTable.LastGlueTableStatus != 'Deleted', + ) + ) + .all() + ) + for table in tables: + DatasetTableIndexer.upsert(session=session, table_uri=table.tableUri) + return tables diff --git a/backend/dataall/modules/datasets/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py index 27a870d60..7ae104cc9 100644 --- a/backend/dataall/modules/datasets/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -8,6 +8,7 @@ from dataall.aws.handlers.sts import SessionHelper from dataall.db import get_engine from dataall.db import models +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.searchproxy import indexers from dataall.searchproxy.connect import connect from dataall.utils.alarm_service import AlarmService @@ -87,7 +88,7 @@ def sync_tables(engine, es=None): processed_tables.extend(tables) if es: - indexers.upsert_dataset_tables(session, es, dataset.datasetUri) + DatasetTableIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) except Exception as e: log.error( f'Failed to sync tables for dataset ' diff --git 
a/backend/dataall/searchproxy/__init__.py b/backend/dataall/searchproxy/__init__.py index 8b648babe..78493adb6 100644 --- a/backend/dataall/searchproxy/__init__.py +++ b/backend/dataall/searchproxy/__init__.py @@ -1,10 +1,8 @@ from .connect import connect -from .indexers import upsert_dataset_tables from .search import run_query __all__ = [ 'connect', 'run_query', 'upsert', - 'upsert_dataset_tables', ] diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index fa91cb4eb..13ba44eea 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,87 +5,10 @@ from .. import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer -from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer log = logging.getLogger(__name__) -class DatasetTableIndexer(BaseIndexer): - - @classmethod - def upsert(cls, session, table_uri: str): - table = ( - session.query( - models.DatasetTable.datasetUri.label('datasetUri'), - models.DatasetTable.tableUri.label('uri'), - models.DatasetTable.name.label('name'), - models.DatasetTable.owner.label('owner'), - models.DatasetTable.label.label('label'), - models.DatasetTable.description.label('description'), - models.Dataset.confidentiality.label('classification'), - models.DatasetTable.tags.label('tags'), - models.Dataset.topics.label('topics'), - models.Dataset.region.label('region'), - models.Organization.organizationUri.label('orgUri'), - models.Organization.name.label('orgName'), - models.Environment.environmentUri.label('envUri'), - models.Environment.name.label('envName'), - models.Dataset.SamlAdminGroupName.label('admins'), - models.Dataset.GlueDatabaseName.label('database'), - models.Dataset.S3BucketName.label('source'), - models.DatasetTable.created, - models.DatasetTable.updated, - models.DatasetTable.deleted, - ) - .join( - models.Dataset, - models.Dataset.datasetUri == models.DatasetTable.datasetUri, - ) - .join( - models.Organization, - models.Dataset.organizationUri == models.Organization.organizationUri, - ) - .join( - models.Environment, - models.Dataset.environmentUri == models.Environment.environmentUri, - ) - .filter(models.DatasetTable.tableUri == table_uri) - .first() - ) - - if table: - glossary = BaseIndexer._get_target_glossary_terms(session, table_uri) - tags = table.tags if table.tags else [] - BaseIndexer._index( - doc_id=table_uri, - doc={ - 'name': table.name, - 'admins': table.admins, - 'owner': table.owner, - 'label': table.label, - 'resourceKind': 'table', - 'description': table.description, - 'database': table.database, - 'source': table.source, - 'classification': table.classification, - 'tags': [t.replace('-', '') for t in tags or []], - 'topics': table.topics, - 'region': table.region.replace('-', ''), - 'datasetUri': table.datasetUri, - 'environmentUri': table.envUri, - 'environmentName': table.envName, - 'organizationUri': table.orgUri, - 'organizationName': table.orgName, - 'created': table.created, - 'updated': table.updated, - 'deleted': table.deleted, - 'glossary': glossary, - }, - ) - DatasetIndexer.upsert(session=session, dataset_uri=table.datasetUri) - return table - - class DashboardIndexer(BaseIndexer): @classmethod def upsert(cls, session, dashboard_uri: str): @@ -149,22 +72,6 @@ def upsert(cls, session, dashboard_uri: str): return dashboard -def upsert_dataset_tables(session, es, datasetUri: str): - tables = ( - session.query(models.DatasetTable) - .filter( - and_( - models.DatasetTable.datasetUri == 
datasetUri, - models.DatasetTable.LastGlueTableStatus != 'Deleted', - ) - ) - .all() - ) - for table in tables: - DatasetTableIndexer.upsert(session=session, table_uri=table.tableUri) - return tables - - def remove_deleted_tables(session, es, datasetUri: str): tables = ( session.query(models.DatasetTable) diff --git a/backend/dataall/tasks/catalog_indexer.py b/backend/dataall/tasks/catalog_indexer.py index e3d80458e..5d32800c7 100644 --- a/backend/dataall/tasks/catalog_indexer.py +++ b/backend/dataall/tasks/catalog_indexer.py @@ -3,10 +3,11 @@ import sys from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from .. import db from ..db import get_engine, exceptions from ..db import models -from dataall.searchproxy.indexers import upsert_dataset_tables, DashboardIndexer +from dataall.searchproxy.indexers import DashboardIndexer from ..searchproxy.connect import ( connect, ) @@ -34,7 +35,7 @@ def index_objects(engine, es): log.info(f'Found {len(all_datasets)} datasets') dataset: models.Dataset for dataset in all_datasets: - tables = upsert_dataset_tables(session, es, dataset.datasetUri) + tables = DatasetTableIndexer.upsert_all(session, dataset.datasetUri) folders = DatasetLocationIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) indexed_objects_counter = ( indexed_objects_counter + len(tables) + len(folders) + 1 diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 9eb50e050..619559b5c 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -29,7 +29,10 @@ def patch_es(module_mocker): module_mocker.patch('dataall.searchproxy.connect', return_value={}) module_mocker.patch('dataall.searchproxy.search', return_value={}) module_mocker.patch('dataall.searchproxy.upsert', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.upsert_dataset_tables', return_value={}) + module_mocker.patch( + 'dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.indexers.upsert_all', + return_value={} + ) module_mocker.patch('dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value={}) module_mocker.patch('dataall.searchproxy.indexers.DatasetTableIndexer.upsert', return_value={}) module_mocker.patch( diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index ffc0370c1..fd31506f1 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -4,9 +4,9 @@ import dataall from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.searchproxy import indexers from dataall.modules.datasets.db.models import DatasetStorageLocation -from dataall.searchproxy.indexers import DatasetTableIndexer from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer @@ -152,7 +152,7 @@ def test_upsert_folder(db, dataset, env, mocker, folder): def test_upsert_tables(db, dataset, env, mocker, folder): mocker.patch('dataall.searchproxy.upsert', return_value={}) with db.scoped_session() as session: - tables = indexers.upsert_dataset_tables( - session, es={}, datasetUri=dataset.datasetUri + tables = DatasetTableIndexer.upsert_all( + session, dataset_uri=dataset.datasetUri ) assert len(tables) == 1 diff --git a/tests/tasks/test_catalog_indexer.py b/tests/tasks/test_catalog_indexer.py index f8901933b..31b0f14d4 100644 --- 
a/tests/tasks/test_catalog_indexer.py +++ b/tests/tasks/test_catalog_indexer.py @@ -83,7 +83,8 @@ def table(org, env, db, sync_dataset): def test_catalog_indexer(db, org, env, sync_dataset, table, mocker): mocker.patch( - 'dataall.searchproxy.indexers.upsert_dataset_tables', return_value=[table] + 'dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.upsert_all', + return_value=[table] ) mocker.patch( 'dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value=sync_dataset From a1825ba4797bdaf8468868d675aa3bcc54193688 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 11:42:15 +0200 Subject: [PATCH 57/67] Fixed test mocking --- tests/api/conftest.py | 4 ++-- tests/api/test_dataset_location.py | 3 ++- tests/cdkproxy/test_dataset_stack.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 619559b5c..f959be417 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -30,11 +30,11 @@ def patch_es(module_mocker): module_mocker.patch('dataall.searchproxy.search', return_value={}) module_mocker.patch('dataall.searchproxy.upsert', return_value={}) module_mocker.patch( - 'dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.indexers.upsert_all', + 'dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.upsert_all', return_value={} ) module_mocker.patch('dataall.modules.datasets.indexers.dataset_indexer.DatasetIndexer.upsert', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.DatasetTableIndexer.upsert', return_value={}) + module_mocker.patch('dataall.modules.datasets.indexers.table_indexer.DatasetTableIndexer.upsert', return_value={}) module_mocker.patch( 'dataall.modules.datasets.indexers.location_indexer.DatasetLocationIndexer.upsert', return_value={} diff --git a/tests/api/test_dataset_location.py b/tests/api/test_dataset_location.py index 128d21bbb..2977a8baf 100644 --- a/tests/api/test_dataset_location.py +++ b/tests/api/test_dataset_location.py @@ -71,7 +71,8 @@ def test_get_dataset(client, dataset1, env1, user, group): def test_create_location(client, dataset1, env1, user, group, patch_es, module_mocker): module_mocker.patch( - 'dataall.aws.handlers.s3.S3.create_bucket_prefix', return_value=True + 'dataall.modules.datasets.handlers.s3_location_handler.S3DatasetLocationHandler.create_bucket_prefix', + return_value=True ) response = client.query( """ diff --git a/tests/cdkproxy/test_dataset_stack.py b/tests/cdkproxy/test_dataset_stack.py index 34f495056..a9a84fc4e 100644 --- a/tests/cdkproxy/test_dataset_stack.py +++ b/tests/cdkproxy/test_dataset_stack.py @@ -8,9 +8,9 @@ @pytest.fixture(scope='function', autouse=True) def patch_methods(mocker, db, dataset, env, org): - mocker.patch('dataall.cdkproxy.stacks.dataset.DatasetStack.get_engine', return_value=db) + mocker.patch('dataall.modules.datasets.cdk.dataset_stack.DatasetStack.get_engine', return_value=db) mocker.patch( - 'dataall.cdkproxy.stacks.dataset.DatasetStack.get_target', return_value=dataset + 'dataall.modules.datasets.cdk.dataset_stack.DatasetStack.get_target', return_value=dataset ) mocker.patch( 'dataall.aws.handlers.sts.SessionHelper.get_delegation_role_name', From d2954853f5f4d894b219d79a5f5e876e84afb669 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 11:43:01 +0200 Subject: [PATCH 58/67] Fixed circular import while half of the module is not migrated --- backend/dataall/api/Objects/Glossary/registry.py | 9 +--------
backend/dataall/modules/datasets/__init__.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 147f97a0e..36fea6cf0 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -6,7 +6,7 @@ from dataall.api import gql from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models -from dataall.searchproxy.indexers import DashboardIndexer, DatasetTableIndexer +from dataall.searchproxy.indexers import DashboardIndexer from dataall.searchproxy.upsert import BaseIndexer @@ -67,11 +67,4 @@ def reindex(cls, session, es: OpenSearch, target_type: str, target_uri: str): object_type="Dashboard", model=models.Dashboard, reindexer=DashboardIndexer -)) - -GlossaryRegistry.register(GlossaryDefinition( - target_type="DatasetTable", - object_type="DatasetTable", - model=models.DatasetTable, - reindexer=DatasetTableIndexer )) \ No newline at end of file diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 03cd58cdc..e02a9d9bf 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,12 +2,11 @@ import logging from typing import List -from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition -from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.db import models from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer +from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) @@ -21,6 +20,9 @@ def is_supported(cls, modes): return ImportMode.API in modes def __init__(self): + from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition + from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition + import dataall.modules.datasets.api FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) @@ -41,6 +43,13 @@ def __init__(self): reindexer=DatasetIndexer )) + GlossaryRegistry.register(GlossaryDefinition( + target_type="DatasetTable", + object_type="DatasetTable", + model=models.DatasetTable, + reindexer=DatasetTableIndexer + )) + log.info("API of datasets has been imported") From 005a5e7201f1e5218dd5d8c4039fdcd593cde772 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 14:23:31 +0200 Subject: [PATCH 59/67] Removed unused alarms --- .../share_managers/s3_share_manager.py | 12 +--- backend/dataall/utils/alarm_service.py | 55 --------------- tests/tasks/test_s3_share_manager.py | 70 ------------------- 3 files changed, 2 insertions(+), 135 deletions(-) diff --git a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py index f0ea4e162..30c72a60e 100644 --- a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py +++ b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py @@ -398,7 +398,7 @@ def delete_dataset_bucket_key_policy( json.dumps(policy) ) - def handle_share_failure(self, error:
Exception) -> bool: + def handle_share_failure(self, error: Exception) -> None: """ Handles share failure by raising an alarm to alarmsTopic Returns @@ -411,12 +411,8 @@ def handle_share_failure(self, error: Exception) -> bool: f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} ' f'due to: {error}' ) - AlarmService().trigger_folder_sharing_failure_alarm( - self.target_folder, self.share, self.target_environment - ) - return True - def handle_revoke_failure(self, error: Exception) -> bool: + def handle_revoke_failure(self, error: Exception) -> None: """ Handles share failure by raising an alarm to alarmsTopic Returns @@ -429,7 +425,3 @@ def handle_revoke_failure(self, error: Exception) -> bool: f'with target account {self.target_environment.AwsAccountId}/{self.target_environment.region} ' f'due to: {error}' ) - AlarmService().trigger_revoke_folder_sharing_failure_alarm( - self.target_folder, self.share, self.target_environment - ) - return True diff --git a/backend/dataall/utils/alarm_service.py b/backend/dataall/utils/alarm_service.py index 436d5a701..b414e1ed0 100644 --- a/backend/dataall/utils/alarm_service.py +++ b/backend/dataall/utils/alarm_service.py @@ -11,7 +11,6 @@ from ..aws.handlers.sts import SessionHelper from ..db import models -from dataall.modules.datasets.db.models import DatasetStorageLocation logger = logging.getLogger(__name__) @@ -73,60 +72,6 @@ def trigger_table_sharing_failure_alarm( """ return self.publish_message_to_alarms_topic(subject, message) - def trigger_folder_sharing_failure_alarm( - self, - folder: DatasetStorageLocation, - share: models.ShareObject, - target_environment: models.Environment, - ): - logger.info('Triggering share failure alarm...') - subject = ( - f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Failure Notification' - ) - message = f""" -You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point. -Alarm Details: - - State Change: OK -> ALARM - - Reason for State Change: S3 Folder sharing failure - - Timestamp: {datetime.now()} - Share Source - - Dataset URI: {share.datasetUri} - - AWS Account: {folder.AWSAccountId} - - Region: {folder.region} - - S3 Bucket: {folder.S3BucketName} - - S3 Folder: {folder.S3Prefix} - Share Target - - AWS Account: {target_environment.AwsAccountId} - - Region: {target_environment.region} -""" - - def trigger_revoke_folder_sharing_failure_alarm( - self, - folder: DatasetStorageLocation, - share: models.ShareObject, - target_environment: models.Environment, - ): - logger.info('Triggering share failure alarm...') - subject = ( - f'ALARM: DATAALL Folder {folder.S3Prefix} Sharing Revoke Failure Notification' - ) - message = f""" -You are receiving this email because your DATAALL {self.envname} environment in the {self.region} region has entered the ALARM state, because it failed to share the folder {folder.S3Prefix} with S3 Access Point. 
-Alarm Details: - - State Change: OK -> ALARM - - Reason for State Change: S3 Folder sharing Revoke failure - - Timestamp: {datetime.now()} - Share Source - - Dataset URI: {share.datasetUri} - - AWS Account: {folder.AWSAccountId} - - Region: {folder.region} - - S3 Bucket: {folder.S3BucketName} - - S3 Folder: {folder.S3Prefix} - Share Target - - AWS Account: {target_environment.AwsAccountId} - - Region: {target_environment.region} -""" - def trigger_revoke_table_sharing_failure_alarm( self, table: models.DatasetTable, diff --git a/tests/tasks/test_s3_share_manager.py b/tests/tasks/test_s3_share_manager.py index 2841be87e..14f61fefd 100644 --- a/tests/tasks/test_s3_share_manager.py +++ b/tests/tasks/test_s3_share_manager.py @@ -1415,73 +1415,3 @@ def test_delete_dataset_bucket_key_policy_existing_policy_with_no_additional_tar # Then kms_put_key_policy_mock.assert_called() kms_put_key_policy_mock.assert_called_with(source_environment.AwsAccountId, 'eu-central-1', kms_get_key_mock.return_value, "default", json.dumps(remaining_policy)) - - -def test_handle_share_failure( - mocker, - source_environment_group: models.EnvironmentGroup, - target_environment_group: models.EnvironmentGroup, - dataset1: models.Dataset, - db, - share1: models.ShareObject, - share_item_folder1: models.ShareObjectItem, - location1: DatasetStorageLocation, - source_environment: models.Environment, - target_environment: models.Environment, -): - # Given - alarm_service_mock = mocker.patch.object(AlarmService, "trigger_folder_sharing_failure_alarm") - - with db.scoped_session() as session: - manager = S3ShareManager( - session, - dataset1, - share1, - location1, - source_environment, - target_environment, - source_environment_group, - target_environment_group, - ) - - error = Exception - # When - manager.handle_share_failure(error) - - # Then - alarm_service_mock.assert_called() - - -def test_handle_revoke_failure( - mocker, - source_environment_group: models.EnvironmentGroup, - target_environment_group: models.EnvironmentGroup, - dataset1: models.Dataset, - db, - share1: models.ShareObject, - share_item_folder1: models.ShareObjectItem, - location1: DatasetStorageLocation, - source_environment: models.Environment, - target_environment: models.Environment, -): - # Given - alarm_service_mock = mocker.patch.object(AlarmService, "trigger_revoke_folder_sharing_failure_alarm") - - with db.scoped_session() as session: - manager = S3ShareManager( - session, - dataset1, - share1, - location1, - source_environment, - target_environment, - source_environment_group, - target_environment_group, - ) - - error = Exception - # When - manager.handle_revoke_failure(error) - - # Then - alarm_service_mock.assert_called() From 0fd7c02f4498e1ab6a22610f64448411e84e7ecc Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 21 Apr 2023 10:19:15 +0200 Subject: [PATCH 60/67] Moved dataset table GraphQL API to modules --- backend/dataall/api/Objects/__init__.py | 1 - .../dataall/modules/datasets/api/__init__.py | 5 +++-- .../datasets/api/table}/__init__.py | 2 +- .../datasets/api/table}/input_types.py | 4 ++-- .../datasets/api/table}/mutations.py | 11 ++++++++--- .../datasets/api/table}/queries.py | 12 ++++++++---- .../datasets/api/table}/resolvers.py | 18 +++++++++--------- .../datasets/api/table}/schema.py | 12 +++++++++--- 8 files changed, 40 insertions(+), 25 deletions(-) rename backend/dataall/{api/Objects/DatasetTable => modules/datasets/api/table}/__init__.py (75%) rename backend/dataall/{api/Objects/DatasetTable =>
modules/datasets/api/table}/input_types.py (93%) rename backend/dataall/{api/Objects/DatasetTable => modules/datasets/api/table}/mutations.py (82%) rename backend/dataall/{api/Objects/DatasetTable => modules/datasets/api/table}/queries.py (81%) rename backend/dataall/{api/Objects/DatasetTable => modules/datasets/api/table}/resolvers.py (95%) rename backend/dataall/{api/Objects/DatasetTable => modules/datasets/api/table}/schema.py (94%) diff --git a/backend/dataall/api/Objects/__init__.py b/backend/dataall/api/Objects/__init__.py index 7c064fb1f..5cc73fbdf 100644 --- a/backend/dataall/api/Objects/__init__.py +++ b/backend/dataall/api/Objects/__init__.py @@ -17,7 +17,6 @@ DataPipeline, Environment, Activity, - DatasetTable, Dataset, Group, Principal, diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 4c279340e..7fe2d06a1 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -2,7 +2,8 @@ from dataall.modules.datasets.api import ( table_column, profiling, - storage_location + storage_location, + table ) -__all__ = ["table_column", "profiling", "storage_location"] +__all__ = ["table_column", "profiling", "storage_location", "table"] diff --git a/backend/dataall/api/Objects/DatasetTable/__init__.py b/backend/dataall/modules/datasets/api/table/__init__.py similarity index 75% rename from backend/dataall/api/Objects/DatasetTable/__init__.py rename to backend/dataall/modules/datasets/api/table/__init__.py index dfa46b264..3aaba05cf 100644 --- a/backend/dataall/api/Objects/DatasetTable/__init__.py +++ b/backend/dataall/modules/datasets/api/table/__init__.py @@ -1,4 +1,4 @@ -from . import ( +from dataall.modules.datasets.api.table import ( input_types, mutations, queries, diff --git a/backend/dataall/api/Objects/DatasetTable/input_types.py b/backend/dataall/modules/datasets/api/table/input_types.py similarity index 93% rename from backend/dataall/api/Objects/DatasetTable/input_types.py rename to backend/dataall/modules/datasets/api/table/input_types.py index a5bd07998..2e6649515 100644 --- a/backend/dataall/api/Objects/DatasetTable/input_types.py +++ b/backend/dataall/modules/datasets/api/table/input_types.py @@ -1,5 +1,5 @@ -from ... import gql -from ....api.constants import SortDirection, GraphQLEnumMapper +from dataall.api import gql +from dataall.api.constants import SortDirection, GraphQLEnumMapper NewDatasetTableInput = gql.InputType( diff --git a/backend/dataall/api/Objects/DatasetTable/mutations.py b/backend/dataall/modules/datasets/api/table/mutations.py similarity index 82% rename from backend/dataall/api/Objects/DatasetTable/mutations.py rename to backend/dataall/modules/datasets/api/table/mutations.py index 532605cff..7a26a6c15 100644 --- a/backend/dataall/api/Objects/DatasetTable/mutations.py +++ b/backend/dataall/modules/datasets/api/table/mutations.py @@ -1,9 +1,14 @@ -from ... 
import gql -from .input_types import ( +from dataall.api import gql +from dataall.modules.datasets.api.table.input_types import ( ModifyDatasetTableInput, NewDatasetTableInput, ) -from .resolvers import * +from dataall.modules.datasets.api.table.resolvers import ( + create_table, + update_table, + delete_table, + publish_table_update +) createDatasetTable = gql.MutationField( name='createDatasetTable', diff --git a/backend/dataall/api/Objects/DatasetTable/queries.py b/backend/dataall/modules/datasets/api/table/queries.py similarity index 81% rename from backend/dataall/api/Objects/DatasetTable/queries.py rename to backend/dataall/modules/datasets/api/table/queries.py index 8f7809e62..a6d8d48cf 100644 --- a/backend/dataall/api/Objects/DatasetTable/queries.py +++ b/backend/dataall/modules/datasets/api/table/queries.py @@ -1,7 +1,11 @@ -from ... import gql -from .input_types import DatasetTableFilter -from .resolvers import * -from .schema import ( +from dataall.api import gql +from dataall.modules.datasets.api.table.input_types import DatasetTableFilter +from dataall.modules.datasets.api.table.resolvers import ( + get_table, + list_shared_tables_by_env_dataset, + preview +) +from dataall.modules.datasets.api.table.schema import ( DatasetTable, DatasetTableSearchResult, ) diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/modules/datasets/api/table/resolvers.py similarity index 95% rename from backend/dataall/api/Objects/DatasetTable/resolvers.py rename to backend/dataall/modules/datasets/api/table/resolvers.py index 7df3d8cba..f4d7f4ea1 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/modules/datasets/api/table/resolvers.py @@ -4,15 +4,15 @@ from botocore.exceptions import ClientError from pyathena import connect -from .... import db -from ..Dataset.resolvers import get_dataset -from ....api.context import Context -from ....aws.handlers.service_handlers import Worker -from ....aws.handlers.sts import SessionHelper -from ....db import permissions, models -from ....db.api import ResourcePolicy, Glossary -from ....searchproxy import indexers -from ....utils import json_utils +from dataall import db +from dataall.api.Objects.Dataset.resolvers import get_dataset +from dataall.api.context import Context +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import permissions, models +from dataall.db.api import ResourcePolicy, Glossary +from dataall.searchproxy import indexers +from dataall.utils import json_utils from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.modules.datasets.services.dataset_table import DatasetTableService diff --git a/backend/dataall/api/Objects/DatasetTable/schema.py b/backend/dataall/modules/datasets/api/table/schema.py similarity index 94% rename from backend/dataall/api/Objects/DatasetTable/schema.py rename to backend/dataall/modules/datasets/api/table/schema.py index 74d413818..666bf7e35 100644 --- a/backend/dataall/api/Objects/DatasetTable/schema.py +++ b/backend/dataall/modules/datasets/api/table/schema.py @@ -1,7 +1,13 @@ from dataall.modules.datasets.api.table_column.resolvers import list_table_columns -from ... 
import gql -from .resolvers import * -from ...constants import GraphQLEnumMapper +from dataall.api import gql +from dataall.modules.datasets.api.table.resolvers import ( + resolve_dataset, + get_glue_table_properties, + resolve_redshift_copy_location, + resolve_glossary_terms, + resolve_redshift_copy_schema +) +from dataall.api.constants import GraphQLEnumMapper TablePermission = gql.ObjectType( name='TablePermission', From 7030c8226c6a741078ee6f0cb9c8e217be406fc4 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 21 Apr 2023 10:53:55 +0200 Subject: [PATCH 61/67] Moved DatasetTable model to modules --- backend/dataall/api/Objects/Feed/registry.py | 1 - .../api/Objects/ShareObject/resolvers.py | 4 +- backend/dataall/aws/handlers/glue.py | 3 +- backend/dataall/aws/handlers/redshift.py | 3 +- backend/dataall/db/api/dataset.py | 33 ++++++------ backend/dataall/db/api/redshift_cluster.py | 25 +++++----- backend/dataall/db/api/share_object.py | 26 +++++----- backend/dataall/db/models/DatasetTable.py | 32 ------------ backend/dataall/db/models/__init__.py | 1 - backend/dataall/modules/datasets/__init__.py | 5 +- .../modules/datasets/api/table/resolvers.py | 17 ++++--- .../datasets/api/table_column/resolvers.py | 8 +-- .../modules/datasets/cdk/dataset_stack.py | 16 +++--- backend/dataall/modules/datasets/db/models.py | 31 +++++++++++- .../datasets/handlers/glue_column_handler.py | 12 ++--- .../datasets/indexers/table_indexer.py | 31 ++++++------ .../services/dataset_profiling_service.py | 28 +++++------ .../services/dataset_share_service.py | 18 +++---- .../datasets/services/dataset_table.py | 50 +++++++++---------- .../datasets/tasks/subscription_service.py | 10 ++-- .../modules/datasets/tasks/tables_syncer.py | 5 +- backend/dataall/searchproxy/indexers.py | 7 +-- .../dataall/tasks/bucket_policy_updater.py | 20 ++++---- .../share_managers/lf_share_manager.py | 23 +++++---- .../lf_process_cross_account_share.py | 5 +- .../lf_process_same_account_share.py | 7 +-- backend/dataall/utils/alarm_service.py | 5 +- ...215e_backfill_dataset_table_permissions.py | 5 +- tests/api/conftest.py | 8 +-- tests/api/test_dataset.py | 4 +- tests/api/test_dataset_profiling.py | 12 ++--- tests/api/test_dataset_table.py | 34 ++++++------- tests/api/test_glossary.py | 6 +-- tests/api/test_share.py | 11 ++-- tests/cdkproxy/conftest.py | 5 +- tests/searchproxy/test_indexers.py | 4 +- tests/tasks/conftest.py | 8 +-- tests/tasks/test_catalog_indexer.py | 3 +- tests/tasks/test_lf_share_manager.py | 35 ++++++------- tests/tasks/test_policies.py | 3 +- tests/tasks/test_subscriptions.py | 3 +- tests/tasks/test_tables_sync.py | 9 ++-- 42 files changed, 292 insertions(+), 284 deletions(-) delete mode 100644 backend/dataall/db/models/DatasetTable.py diff --git a/backend/dataall/api/Objects/Feed/registry.py b/backend/dataall/api/Objects/Feed/registry.py index 6a01a488a..4fedd252a 100644 --- a/backend/dataall/api/Objects/Feed/registry.py +++ b/backend/dataall/api/Objects/Feed/registry.py @@ -38,5 +38,4 @@ def types(cls): FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) -FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/api/Objects/ShareObject/resolvers.py b/backend/dataall/api/Objects/ShareObject/resolvers.py index 16e4e1353..49f20fc17 100644 --- a/backend/dataall/api/Objects/ShareObject/resolvers.py +++ 
b/backend/dataall/api/Objects/ShareObject/resolvers.py @@ -7,7 +7,7 @@ from ....api.context import Context from ....aws.handlers.service_handlers import Worker from ....db import models -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable log = logging.getLogger(__name__) @@ -265,7 +265,7 @@ def resolve_dataset(context: Context, source: models.ShareObject, **kwargs): def union_resolver(object, *_): - if isinstance(object, models.DatasetTable): + if isinstance(object, DatasetTable): return 'DatasetTable' elif isinstance(object, DatasetStorageLocation): return 'DatasetStorageLocation' diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index e76fd4e63..c2a7ecf21 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,6 +6,7 @@ from .sts import SessionHelper from ... import db from ...db import models +from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger('aws:glue') @@ -524,7 +525,7 @@ def get_job_runs(engine, task: models.Task): @staticmethod def grant_principals_all_table_permissions( - table: models.DatasetTable, principals: [str], client=None + table: DatasetTable, principals: [str], client=None ): """ Update the table permissions on Lake Formation diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index c186d5df7..1fe6c738c 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -11,6 +11,7 @@ from ...db import models # TODO should be migrated in the redshift module from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger(__name__) @@ -448,7 +449,7 @@ def copy_data(engine, task: models.Task): session, task.payload['datasetUri'] ) - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, task.payload['tableUri'] ) diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py index 3c2c8de10..d328dddb5 100644 --- a/backend/dataall/db/api/dataset.py +++ b/backend/dataall/db/api/dataset.py @@ -16,9 +16,10 @@ from . import Organization from .. 
import models, api, exceptions, permissions, paginate from ..models.Enums import Language, ConfidentialityClassification -from ...modules.datasets.db.dataset_repository import DatasetRepository -from ...modules.datasets.services.dataset_location import DatasetLocationService -from ...utils.naming_convention import ( +from dataall.modules.datasets.db.dataset_repository import DatasetRepository +from dataall.modules.datasets.db.models import DatasetTable +from dataall.modules.datasets.services.dataset_location import DatasetLocationService +from dataall.utils.naming_convention import ( NamingConventionService, NamingConventionPattern, ) @@ -266,21 +267,21 @@ def paginated_dataset_tables( session, username, groups, uri, data=None, check_perm=None ) -> dict: query = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .filter( and_( - models.DatasetTable.datasetUri == uri, - models.DatasetTable.LastGlueTableStatus != 'Deleted', + DatasetTable.datasetUri == uri, + DatasetTable.LastGlueTableStatus != 'Deleted', ) ) - .order_by(models.DatasetTable.created.desc()) + .order_by(DatasetTable.created.desc()) ) if data and data.get('term'): query = query.filter( or_( *[ - models.DatasetTable.name.ilike('%' + data.get('term') + '%'), - models.DatasetTable.GlueTableName.ilike( + DatasetTable.name.ilike('%' + data.get('term') + '%'), + DatasetTable.GlueTableName.ilike( '%' + data.get('term') + '%' ), ] @@ -379,7 +380,7 @@ def transfer_stewardship_to_new_stewards(session, dataset, new_stewards): group=new_stewards, permissions=permissions.DATASET_TABLE_READ, resource_uri=tableUri, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) dataset_shares = ( @@ -455,8 +456,8 @@ def update_glue_database_status(session, dataset_uri): def get_dataset_tables(session, dataset_uri): """return the dataset tables""" return ( - session.query(models.DatasetTable) - .filter(models.DatasetTable.datasetUri == dataset_uri) + session.query(DatasetTable) + .filter(DatasetTable.datasetUri == dataset_uri) .all() ) @@ -585,10 +586,10 @@ def _delete_dataset_term_links(session, uri): @staticmethod def _delete_dataset_tables(session, dataset_uri) -> bool: tables = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .filter( and_( - models.DatasetTable.datasetUri == dataset_uri, + DatasetTable.datasetUri == dataset_uri, ) ) .all() @@ -618,7 +619,7 @@ def get_dataset_by_bucket_name(session, bucket) -> [models.Dataset]: @staticmethod def count_dataset_tables(session, dataset_uri): return ( - session.query(models.DatasetTable) - .filter(models.DatasetTable.datasetUri == dataset_uri) + session.query(DatasetTable) + .filter(DatasetTable.datasetUri == dataset_uri) .count() ) diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 31b795225..de5799180 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -4,11 +4,12 @@ from .. import models, api, exceptions, paginate, permissions from . 
import has_resource_perm, ResourcePolicy, Environment, Dataset -from ...utils.naming_convention import ( +from dataall.modules.datasets.db.models import DatasetTable +from dataall.utils.naming_convention import ( NamingConventionService, NamingConventionPattern, ) -from ...utils.slugify import slugify +from dataall.utils.slugify import slugify log = logging.getLogger(__name__) @@ -334,13 +335,13 @@ def list_available_cluster_tables( ) created = ( session.query( - models.DatasetTable.datasetUri.label('datasetUri'), - models.DatasetTable.tableUri.label('tableUri'), + DatasetTable.datasetUri.label('datasetUri'), + DatasetTable.tableUri.label('tableUri'), models.RedshiftCluster.clusterUri.label('clusterUri'), ) .join( models.Dataset, - models.DatasetTable.datasetUri == models.Dataset.datasetUri, + DatasetTable.datasetUri == models.Dataset.datasetUri, ) .filter( and_( @@ -354,8 +355,8 @@ def list_available_cluster_tables( ) ) .group_by( - models.DatasetTable.datasetUri, - models.DatasetTable.tableUri, + DatasetTable.datasetUri, + DatasetTable.tableUri, models.RedshiftCluster.clusterUri, ) ) @@ -363,10 +364,10 @@ def list_available_cluster_tables( 'all_group_tables_sub_query' ) query = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .join( all_group_tables_sub_query, - all_group_tables_sub_query.c.tableUri == models.DatasetTable.tableUri, + all_group_tables_sub_query.c.tableUri == DatasetTable.tableUri, ) .filter( models.RedshiftCluster.clusterUri == cluster.clusterUri, @@ -541,18 +542,18 @@ def list_copy_enabled_tables( session, username, groups, uri, data=None, check_perm=True ) -> [models.RedshiftClusterDatasetTable]: q = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .join( models.RedshiftClusterDatasetTable, models.RedshiftClusterDatasetTable.tableUri - == models.DatasetTable.tableUri, + == DatasetTable.tableUri, ) .filter(models.RedshiftClusterDatasetTable.clusterUri == uri) ) if data.get('term'): term = data.get('term') q = q.filter( - models.DatasetTable.label.ilike('%' + term + '%'), + DatasetTable.label.ilike('%' + term + '%'), ) return paginate( q, page=data.get('page', 1), page_size=data.get('pageSize', 20) diff --git a/backend/dataall/db/api/share_object.py b/backend/dataall/db/api/share_object.py index bd0215190..4fddda5e9 100644 --- a/backend/dataall/db/api/share_object.py +++ b/backend/dataall/db/api/share_object.py @@ -10,7 +10,7 @@ from .. import api, utils from .. 
import models, exceptions, permissions, paginate from ..models.Enums import ShareObjectStatus, ShareItemStatus, ShareObjectActions, ShareItemActions, ShareableType, PrincipalType -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable logger = logging.getLogger(__name__) @@ -422,7 +422,7 @@ def create_share_object( if itemType == ShareableType.StorageLocation.value: item = session.query(DatasetStorageLocation).get(itemUri) if itemType == ShareableType.Table.value: - item = session.query(models.DatasetTable).get(itemUri) + item = session.query(DatasetTable).get(itemUri) share_item = ( session.query(models.ShareObjectItem) @@ -605,7 +605,7 @@ def approve_share_object( group=share.principalId, permissions=permissions.DATASET_TABLE_READ, resource_uri=table.itemUri, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) api.Notification.notify_share_object_approval(session, username, dataset, share) @@ -717,7 +717,7 @@ def get_share_item( ShareObject.get_share_item_by_uri(session, data['shareItemUri']), ) if share_item.itemType == ShareableType.Table.value: - return session.query(models.DatasetTable).get(share_item.itemUri) + return session.query(DatasetTable).get(share_item.itemUri) if share_item.itemType == ShareableType.StorageLocation: return session.Query(DatasetStorageLocation).get(share_item.itemUri) @@ -762,7 +762,7 @@ def add_share_object_item( Share_SM.update_state(session, share, new_share_state) if itemType == ShareableType.Table.value: - item: models.DatasetTable = session.query(models.DatasetTable).get(itemUri) + item: DatasetTable = session.query(DatasetTable).get(itemUri) if item and item.region != target_environment.region: raise exceptions.UnauthorizedOperation( action=permissions.ADD_ITEM, @@ -944,10 +944,10 @@ def list_shareable_items( # marking the table as part of the shareObject tables = ( session.query( - models.DatasetTable.tableUri.label('itemUri'), + DatasetTable.tableUri.label('itemUri'), func.coalesce('DatasetTable').label('itemType'), - models.DatasetTable.GlueTableName.label('itemName'), - models.DatasetTable.description.label('description'), + DatasetTable.GlueTableName.label('itemName'), + DatasetTable.description.label('description'), models.ShareObjectItem.shareItemUri.label('shareItemUri'), models.ShareObjectItem.status.label('status'), case( @@ -959,10 +959,10 @@ def list_shareable_items( models.ShareObjectItem, and_( models.ShareObjectItem.shareUri == share.shareUri, - models.DatasetTable.tableUri == models.ShareObjectItem.itemUri, + DatasetTable.tableUri == models.ShareObjectItem.itemUri, ), ) - .filter(models.DatasetTable.datasetUri == datasetUri) + .filter(DatasetTable.datasetUri == datasetUri) ) if data: if data.get("isRevokable"): @@ -1145,7 +1145,7 @@ def update_share_item_status_batch( def find_share_item_by_table( session, share: models.ShareObject, - table: models.DatasetTable, + table: DatasetTable, ) -> models.ShareObjectItem: share_item: models.ShareObjectItem = ( session.query(models.ShareObjectItem) @@ -1247,10 +1247,10 @@ def get_share_data_items(session, share_uri, status): raise exceptions.ObjectNotFound('Share', share_uri) tables = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .join( models.ShareObjectItem, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + models.ShareObjectItem.itemUri == DatasetTable.tableUri, ) .join( models.ShareObject, diff --git 
a/backend/dataall/db/models/DatasetTable.py b/backend/dataall/db/models/DatasetTable.py deleted file mode 100644 index e97174167..000000000 --- a/backend/dataall/db/models/DatasetTable.py +++ /dev/null @@ -1,32 +0,0 @@ -from sqlalchemy import Column, String, Text -from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import query_expression - -from .. import Base -from .. import Resource, utils - - -class DatasetTable(Resource, Base): - __tablename__ = 'dataset_table' - datasetUri = Column(String, nullable=False) - tableUri = Column(String, primary_key=True, default=utils.uuid('table')) - AWSAccountId = Column(String, nullable=False) - S3BucketName = Column(String, nullable=False) - S3Prefix = Column(String, nullable=False) - GlueDatabaseName = Column(String, nullable=False) - GlueTableName = Column(String, nullable=False) - GlueTableConfig = Column(Text) - GlueTableProperties = Column(postgresql.JSON, default={}) - LastGlueTableStatus = Column(String, default='InSync') - region = Column(String, default='eu-west-1') - # LastGeneratedPreviewDate= Column(DateTime, default=None) - confidentiality = Column(String, nullable=True) - userRoleForTable = query_expression() - projectPermission = query_expression() - redshiftClusterPermission = query_expression() - stage = Column(String, default='RAW') - topics = Column(postgresql.ARRAY(String), nullable=True) - confidentiality = Column(String, nullable=False, default='C1') - - def uri(self): - return self.tableUri diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py index c288527cf..123547f8c 100644 --- a/backend/dataall/db/models/__init__.py +++ b/backend/dataall/db/models/__init__.py @@ -6,7 +6,6 @@ from .DashboardShare import DashboardShareStatus from .Dataset import Dataset from .DatasetQualityRule import DatasetQualityRule -from .DatasetTable import DatasetTable from .Environment import Environment from .EnvironmentGroup import EnvironmentGroup from .FeedMessage import FeedMessage diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index e02a9d9bf..4f8964016 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -3,7 +3,7 @@ from typing import List from dataall.db import models -from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetStorageLocation, DatasetTable from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer @@ -27,6 +27,7 @@ def __init__(self): FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) FeedRegistry.register(FeedDefinition("DatasetStorageLocation", DatasetStorageLocation)) + FeedRegistry.register(FeedDefinition("DatasetTable", DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Column", "DatasetTableColumn", DatasetTableColumn)) GlossaryRegistry.register(GlossaryDefinition( @@ -46,7 +47,7 @@ def __init__(self): GlossaryRegistry.register(GlossaryDefinition( target_type="DatasetTable", object_type="DatasetTable", - model=models.DatasetTable, + model=DatasetTable, reindexer=DatasetTableIndexer )) diff --git a/backend/dataall/modules/datasets/api/table/resolvers.py b/backend/dataall/modules/datasets/api/table/resolvers.py index 
f4d7f4ea1..ea16cae79 100644 --- a/backend/dataall/modules/datasets/api/table/resolvers.py +++ b/backend/dataall/modules/datasets/api/table/resolvers.py @@ -11,6 +11,7 @@ from dataall.aws.handlers.sts import SessionHelper from dataall.db import permissions, models from dataall.db.api import ResourcePolicy, Glossary +from dataall.modules.datasets.db.models import DatasetTable from dataall.searchproxy import indexers from dataall.utils import json_utils from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer @@ -104,7 +105,7 @@ def delete_table(context, source, tableUri: str = None): def preview(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) @@ -155,17 +156,17 @@ def preview(context, source, tableUri: str = None): return {'rows': rows, 'fields': fields} -def get_glue_table_properties(context: Context, source: models.DatasetTable, **kwargs): +def get_glue_table_properties(context: Context, source: DatasetTable, **kwargs): if not source: return None with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, source.tableUri ) return json_utils.to_string(table.GlueTableProperties).replace('\\', ' ') -def resolve_dataset(context, source: models.DatasetTable, **kwargs): +def resolve_dataset(context, source: DatasetTable, **kwargs): if not source: return None with context.engine.scoped_session() as session: @@ -177,7 +178,7 @@ def resolve_dataset(context, source: models.DatasetTable, **kwargs): return dataset_with_role -def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwargs): +def resolve_glossary_terms(context: Context, source: DatasetTable, **kwargs): if not source: return None with context.engine.scoped_session() as session: @@ -188,7 +189,7 @@ def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwar def publish_table_update(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -217,7 +218,7 @@ def publish_table_update(context: Context, source, tableUri: str = None): return True -def resolve_redshift_copy_schema(context, source: models.DatasetTable, clusterUri: str): +def resolve_redshift_copy_schema(context, source: DatasetTable, clusterUri: str): if not source: return None with context.engine.scoped_session() as session: @@ -227,7 +228,7 @@ def resolve_redshift_copy_schema(context, source: models.DatasetTable, clusterUr def resolve_redshift_copy_location( - context, source: models.DatasetTable, clusterUri: str + context, source: DatasetTable, clusterUri: str ): with context.engine.scoped_session() as session: return db.api.RedshiftCluster.get_cluster_dataset_table( diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index 8e78a042e..b27a99dd3 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ 
b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -6,12 +6,12 @@ from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetTable def list_table_columns( context: Context, - source: models.DatasetTable, + source: DatasetTable, tableUri: str = None, filter: dict = None, ): @@ -46,7 +46,7 @@ def list_table_columns( def sync_table_columns(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -81,7 +81,7 @@ def update_table_column( ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) - table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( + table: DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, column.tableUri ) ResourcePolicy.check_user_resource_permission( diff --git a/backend/dataall/modules/datasets/cdk/dataset_stack.py b/backend/dataall/modules/datasets/cdk/dataset_stack.py index e99b43b0c..517b32893 100644 --- a/backend/dataall/modules/datasets/cdk/dataset_stack.py +++ b/backend/dataall/modules/datasets/cdk/dataset_stack.py @@ -28,7 +28,7 @@ from dataall.db.api import Environment from dataall.utils.cdk_nag_utils import CDKNagUtil from dataall.utils.runtime_stacks_tagging import TagsUtil -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable logger = logging.getLogger(__name__) @@ -77,17 +77,17 @@ def get_shared_tables(self) -> typing.List[models.ShareObjectItem]: with engine.scoped_session() as session: tables = ( session.query( - models.DatasetTable.GlueDatabaseName.label('GlueDatabaseName'), - models.DatasetTable.GlueTableName.label('GlueTableName'), - models.DatasetTable.AWSAccountId.label('SourceAwsAccountId'), - models.DatasetTable.region.label('SourceRegion'), + DatasetTable.GlueDatabaseName.label('GlueDatabaseName'), + DatasetTable.GlueTableName.label('GlueTableName'), + DatasetTable.AWSAccountId.label('SourceAwsAccountId'), + DatasetTable.region.label('SourceRegion'), models.Environment.AwsAccountId.label('TargetAwsAccountId'), models.Environment.region.label('TargetRegion'), ) .join( models.ShareObjectItem, and_( - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri + models.ShareObjectItem.itemUri == DatasetTable.tableUri ), ) .join( @@ -101,8 +101,8 @@ def get_shared_tables(self) -> typing.List[models.ShareObjectItem]: ) .filter( and_( - models.DatasetTable.datasetUri == self.target_uri, - models.DatasetTable.deleted.is_(None), + DatasetTable.datasetUri == self.target_uri, + DatasetTable.deleted.is_(None), models.ShareObjectItem.status.in_(self.shared_states) ) ) diff --git a/backend/dataall/modules/datasets/db/models.py b/backend/dataall/modules/datasets/db/models.py index 2dfee26ec..a25978bef 100644 --- a/backend/dataall/modules/datasets/db/models.py +++ b/backend/dataall/modules/datasets/db/models.py @@ -1,5 +1,5 @@ -from sqlalchemy import Boolean, Column, String -from sqlalchemy.dialects.postgresql import JSON +from sqlalchemy import Boolean, Column, String, 
Text +from sqlalchemy.dialects.postgresql import JSON, ARRAY from sqlalchemy.orm import query_expression from dataall.db import Base, Resource, utils @@ -56,3 +56,30 @@ class DatasetStorageLocation(Resource, Base): def uri(self): return self.locationUri + +class DatasetTable(Resource, Base): + __tablename__ = 'dataset_table' + datasetUri = Column(String, nullable=False) + tableUri = Column(String, primary_key=True, default=utils.uuid('table')) + AWSAccountId = Column(String, nullable=False) + S3BucketName = Column(String, nullable=False) + S3Prefix = Column(String, nullable=False) + GlueDatabaseName = Column(String, nullable=False) + GlueTableName = Column(String, nullable=False) + GlueTableConfig = Column(Text) + GlueTableProperties = Column(JSON, default={}) + LastGlueTableStatus = Column(String, default='InSync') + region = Column(String, default='eu-west-1') + # LastGeneratedPreviewDate= Column(DateTime, default=None) + confidentiality = Column(String, nullable=True) + userRoleForTable = query_expression() + projectPermission = query_expression() + redshiftClusterPermission = query_expression() + stage = Column(String, default='RAW') + topics = Column(ARRAY(String), nullable=True) + confidentiality = Column(String, nullable=False, default='C1') + + def uri(self): + return self.tableUri + + diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index df43f9dbd..f41784959 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -5,7 +5,7 @@ from dataall.aws.handlers.sts import SessionHelper from dataall.db import models from dataall.aws.handlers.service_handlers import Worker -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetTable from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class DatasetColumnGlueHandler: @Worker.handler('glue.table.columns') def get_table_columns(engine, task: models.Task): with engine.scoped_session() as session: - dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( + dataset_table: DatasetTable = session.query(DatasetTable).get( task.targetUri ) aws = SessionHelper.remote_session(dataset_table.AWSAccountId) @@ -46,12 +46,8 @@ def get_table_columns(engine, task: models.Task): @Worker.handler('glue.table.update_column') def update_table_columns(engine, task: models.Task): with engine.scoped_session() as session: - column: DatasetTableColumn = session.query( - DatasetTableColumn - ).get(task.targetUri) - table: models.DatasetTable = session.query(models.DatasetTable).get( - column.tableUri - ) + column: DatasetTableColumn = session.query(DatasetTableColumn).get(task.targetUri) + table: DatasetTable = session.query(DatasetTable).get(column.tableUri) try: aws_session = SessionHelper.remote_session(table.AWSAccountId) diff --git a/backend/dataall/modules/datasets/indexers/table_indexer.py b/backend/dataall/modules/datasets/indexers/table_indexer.py index 1eab70a87..4c96eea6d 100644 --- a/backend/dataall/modules/datasets/indexers/table_indexer.py +++ b/backend/dataall/modules/datasets/indexers/table_indexer.py @@ -2,6 +2,7 @@ from operator import and_ from dataall.db import models +from dataall.modules.datasets.db.models import DatasetTable from 
dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer from dataall.searchproxy.upsert import BaseIndexer @@ -12,14 +13,14 @@ class DatasetTableIndexer(BaseIndexer): def upsert(cls, session, table_uri: str): table = ( session.query( - models.DatasetTable.datasetUri.label('datasetUri'), - models.DatasetTable.tableUri.label('uri'), - models.DatasetTable.name.label('name'), - models.DatasetTable.owner.label('owner'), - models.DatasetTable.label.label('label'), - models.DatasetTable.description.label('description'), + DatasetTable.datasetUri.label('datasetUri'), + DatasetTable.tableUri.label('uri'), + DatasetTable.name.label('name'), + DatasetTable.owner.label('owner'), + DatasetTable.label.label('label'), + DatasetTable.description.label('description'), models.Dataset.confidentiality.label('classification'), - models.DatasetTable.tags.label('tags'), + DatasetTable.tags.label('tags'), models.Dataset.topics.label('topics'), models.Dataset.region.label('region'), models.Organization.organizationUri.label('orgUri'), @@ -29,13 +30,13 @@ def upsert(cls, session, table_uri: str): models.Dataset.SamlAdminGroupName.label('admins'), models.Dataset.GlueDatabaseName.label('database'), models.Dataset.S3BucketName.label('source'), - models.DatasetTable.created, - models.DatasetTable.updated, - models.DatasetTable.deleted, + DatasetTable.created, + DatasetTable.updated, + DatasetTable.deleted, ) .join( models.Dataset, - models.Dataset.datasetUri == models.DatasetTable.datasetUri, + models.Dataset.datasetUri == DatasetTable.datasetUri, ) .join( models.Organization, @@ -45,7 +46,7 @@ def upsert(cls, session, table_uri: str): models.Environment, models.Dataset.environmentUri == models.Environment.environmentUri, ) - .filter(models.DatasetTable.tableUri == table_uri) + .filter(DatasetTable.tableUri == table_uri) .first() ) @@ -84,11 +85,11 @@ def upsert(cls, session, table_uri: str): @classmethod def upsert_all(cls, session, dataset_uri: str): tables = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .filter( and_( - models.DatasetTable.datasetUri == dataset_uri, - models.DatasetTable.LastGlueTableStatus != 'Deleted', + DatasetTable.datasetUri == dataset_uri, + DatasetTable.LastGlueTableStatus != 'Deleted', ) ) .all() diff --git a/backend/dataall/modules/datasets/services/dataset_profiling_service.py b/backend/dataall/modules/datasets/services/dataset_profiling_service.py index 5b6ca8d41..01bc3dc57 100644 --- a/backend/dataall/modules/datasets/services/dataset_profiling_service.py +++ b/backend/dataall/modules/datasets/services/dataset_profiling_service.py @@ -2,7 +2,7 @@ from dataall.db import paginate, models from dataall.db.exceptions import ObjectNotFound -from dataall.modules.datasets.db.models import DatasetProfilingRun +from dataall.modules.datasets.db.models import DatasetProfilingRun, DatasetTable class DatasetProfilingService: @@ -18,7 +18,7 @@ def start_profiling( raise ObjectNotFound('Dataset', datasetUri) if tableUri and not GlueTableName: - table: models.DatasetTable = session.query(models.DatasetTable).get( + table: DatasetTable = session.query(DatasetTable).get( tableUri ) if not table: @@ -105,13 +105,13 @@ def list_table_profiling_runs(session, tableUri, filter): q = ( session.query(DatasetProfilingRun) .join( - models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, + DatasetTable, + DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) .filter( and_( - models.DatasetTable.tableUri == tableUri, - 
models.DatasetTable.GlueTableName + DatasetTable.tableUri == tableUri, + DatasetTable.GlueTableName == DatasetProfilingRun.GlueTableName, ) ) @@ -126,12 +126,12 @@ def get_table_last_profiling_run(session, tableUri): return ( session.query(DatasetProfilingRun) .join( - models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, + DatasetTable, + DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) - .filter(models.DatasetTable.tableUri == tableUri) + .filter(DatasetTable.tableUri == tableUri) .filter( - models.DatasetTable.GlueTableName + DatasetTable.GlueTableName == DatasetProfilingRun.GlueTableName ) .order_by(DatasetProfilingRun.created.desc()) @@ -143,12 +143,12 @@ def get_table_last_profiling_run_with_results(session, tableUri): return ( session.query(DatasetProfilingRun) .join( - models.DatasetTable, - models.DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, + DatasetTable, + DatasetTable.datasetUri == DatasetProfilingRun.datasetUri, ) - .filter(models.DatasetTable.tableUri == tableUri) + .filter(DatasetTable.tableUri == tableUri) .filter( - models.DatasetTable.GlueTableName + DatasetTable.GlueTableName == DatasetProfilingRun.GlueTableName ) .filter(DatasetProfilingRun.results.isnot(None)) diff --git a/backend/dataall/modules/datasets/services/dataset_share_service.py b/backend/dataall/modules/datasets/services/dataset_share_service.py index 9ca84a1cf..3503e86fe 100644 --- a/backend/dataall/modules/datasets/services/dataset_share_service.py +++ b/backend/dataall/modules/datasets/services/dataset_share_service.py @@ -8,7 +8,7 @@ from dataall.db import models, permissions from dataall.db.api import has_resource_perm, ShareItemSM from dataall.db.paginator import paginate -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable class DatasetShareService: @@ -41,9 +41,9 @@ def paginated_shared_with_environment_datasets( models.ShareObjectItem.itemType == ShareableType.Table.value, func.concat( - models.DatasetTable.GlueDatabaseName, + DatasetTable.GlueDatabaseName, '.', - models.DatasetTable.GlueTableName, + DatasetTable.GlueTableName, ), ), ( @@ -73,8 +73,8 @@ def paginated_shared_with_environment_datasets( == models.Environment.organizationUri, ) .outerjoin( - models.DatasetTable, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + DatasetTable, + models.ShareObjectItem.itemUri == DatasetTable.tableUri, ) .outerjoin( DatasetStorageLocation, @@ -137,9 +137,9 @@ def paginated_shared_with_environment_group_datasets( models.ShareObjectItem.itemType == ShareableType.Table.value, func.concat( - models.DatasetTable.GlueDatabaseName, + DatasetTable.GlueDatabaseName, '.', - models.DatasetTable.GlueTableName, + DatasetTable.GlueTableName, ), ), ( @@ -169,8 +169,8 @@ def paginated_shared_with_environment_group_datasets( == models.Environment.organizationUri, ) .outerjoin( - models.DatasetTable, - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + DatasetTable, + models.ShareObjectItem.itemUri == DatasetTable.tableUri, ) .outerjoin( DatasetStorageLocation, diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index cd02eadf5..7776aa2ef 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -6,7 +6,7 @@ from dataall.db.api import has_tenant_perm, has_resource_perm, 
Glossary, ResourcePolicy, Environment from dataall.db.models import Dataset from dataall.utils import json_utils -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetTable logger = logging.getLogger(__name__) @@ -22,14 +22,14 @@ def create_dataset_table( uri: str, data: dict = None, check_perm: bool = False, - ) -> models.DatasetTable: + ) -> DatasetTable: dataset = api.Dataset.get_dataset_by_uri(session, uri) exists = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .filter( and_( - models.DatasetTable.datasetUri == uri, - models.DatasetTable.GlueTableName == data['name'], + DatasetTable.datasetUri == uri, + DatasetTable.GlueTableName == data['name'], ) ) .count() @@ -41,7 +41,7 @@ def create_dataset_table( message=f'table: {data["name"]} already exist on dataset {uri}', ) - table = models.DatasetTable( + table = DatasetTable( datasetUri=uri, label=data['name'], name=data['name'], @@ -72,7 +72,7 @@ def create_dataset_table( group=group, permissions=permissions.DATASET_TABLE_READ, resource_uri=table.tableUri, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) return table @@ -87,13 +87,13 @@ def list_dataset_tables( check_perm: bool = False, ) -> dict: query = ( - session.query(models.DatasetTable) - .filter(models.DatasetTable.datasetUri == uri) - .order_by(models.DatasetTable.created.desc()) + session.query(DatasetTable) + .filter(DatasetTable.datasetUri == uri) + .order_by(DatasetTable.created.desc()) ) if data.get('term'): term = data.get('term') - query = query.filter(models.DatasetTable.label.ilike('%' + term + '%')) + query = query.filter(DatasetTable.label.ilike('%' + term + '%')) return paginate( query, page=data.get('page', 1), page_size=data.get('pageSize', 10) ).to_dict() @@ -107,7 +107,7 @@ def get_dataset_table( uri: str, data: dict = None, check_perm: bool = False, - ) -> models.DatasetTable: + ) -> DatasetTable: return DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) @staticmethod @@ -183,10 +183,10 @@ def query_dataset_tables_shared_with_env( """ share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() env_tables_shared = ( - session.query(models.DatasetTable) # all tables + session.query(DatasetTable) # all tables .join( models.ShareObjectItem, # found in ShareObjectItem - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri, + models.ShareObjectItem.itemUri == DatasetTable.tableUri, ) .join( models.ShareObject, # jump to share object @@ -218,7 +218,7 @@ def get_dataset_tables_shared_with_env( @staticmethod def get_dataset_table_by_uri(session, table_uri): - table: models.DatasetTable = session.query(models.DatasetTable).get(table_uri) + table: DatasetTable = session.query(DatasetTable).get(table_uri) if not table: raise exceptions.ObjectNotFound('DatasetTable', table_uri) return table @@ -229,8 +229,8 @@ def sync(session, datasetUri, glue_tables=None): dataset: Dataset = session.query(Dataset).get(datasetUri) if dataset: existing_tables = ( - session.query(models.DatasetTable) - .filter(models.DatasetTable.datasetUri == datasetUri) + session.query(DatasetTable) + .filter(DatasetTable.datasetUri == datasetUri) .all() ) existing_table_names = [e.GlueTableName for e in existing_tables] @@ -245,7 +245,7 @@ def sync(session, datasetUri, glue_tables=None): logger.info( f'Storing new table: {table} for dataset db {dataset.GlueDatabaseName}' ) - updated_table = models.DatasetTable( + 
updated_table = DatasetTable( datasetUri=dataset.datasetUri, label=table['Name'], name=table['Name'], @@ -272,13 +272,13 @@ def sync(session, datasetUri, glue_tables=None): group=group, permissions=permissions.DATASET_TABLE_READ, resource_uri=updated_table.tableUri, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) else: logger.info( f'Updating table: {table} for dataset db {dataset.GlueDatabaseName}' ) - updated_table: models.DatasetTable = ( + updated_table: DatasetTable = ( existing_dataset_tables_map.get(table['Name']) ) updated_table.GlueTableProperties = json_utils.to_json( @@ -345,13 +345,13 @@ def delete_all_table_columns(session, dataset_table): @staticmethod def get_table_by_s3_prefix(session, s3_prefix, accountid, region): - table: models.DatasetTable = ( - session.query(models.DatasetTable) + table: DatasetTable = ( + session.query(DatasetTable) .filter( and_( - models.DatasetTable.S3Prefix.startswith(s3_prefix), - models.DatasetTable.AWSAccountId == accountid, - models.DatasetTable.region == region, + DatasetTable.S3Prefix.startswith(s3_prefix), + DatasetTable.AWSAccountId == accountid, + DatasetTable.region == region, ) ) .first() diff --git a/backend/dataall/modules/datasets/tasks/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py index 901865812..ae1c522e0 100644 --- a/backend/dataall/modules/datasets/tasks/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -17,7 +17,7 @@ from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService from dataall.modules.datasets.services.dataset_location import DatasetLocationService -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -68,7 +68,7 @@ def notify_consumers(engine, messages): @staticmethod def publish_table_update_message(engine, message): with engine.scoped_session() as session: - table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( + table: DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), @@ -139,7 +139,7 @@ def publish_location_update_message(session, message): @staticmethod def store_dataquality_results(session, message): - table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( + table: DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), @@ -207,7 +207,7 @@ def set_columns_type(quality_results, message): @staticmethod def publish_sns_message( - engine, message, dataset, share_items, prefix, table: models.DatasetTable = None + engine, message, dataset, share_items, prefix, table: DatasetTable = None ): with engine.scoped_session() as session: for item in share_items: @@ -290,7 +290,7 @@ def redshift_copy( message, dataset: models.Dataset, environment: models.Environment, - table: models.DatasetTable, + table: DatasetTable, ): log.info( f'Redshift copy starting ' diff --git a/backend/dataall/modules/datasets/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py index 7ae104cc9..0974df585 100644 --- a/backend/dataall/modules/datasets/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -8,6 +8,7 @@ from dataall.aws.handlers.sts import SessionHelper from dataall.db 
import get_engine from dataall.db import models +from dataall.modules.datasets.db.models import DatasetTable from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.searchproxy import indexers from dataall.searchproxy.connect import connect @@ -68,8 +69,8 @@ def sync_tables(engine, es=None): ) tables = ( - session.query(models.DatasetTable) - .filter(models.DatasetTable.datasetUri == dataset.datasetUri) + session.query(DatasetTable) + .filter(DatasetTable.datasetUri == dataset.datasetUri) .all() ) diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 13ba44eea..9140cf3aa 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -5,6 +5,7 @@ from .. import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer +from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger(__name__) @@ -74,11 +75,11 @@ def upsert(cls, session, dashboard_uri: str): def remove_deleted_tables(session, es, datasetUri: str): tables = ( - session.query(models.DatasetTable) + session.query(DatasetTable) .filter( and_( - models.DatasetTable.datasetUri == datasetUri, - models.DatasetTable.LastGlueTableStatus == 'Deleted', + DatasetTable.datasetUri == datasetUri, + DatasetTable.LastGlueTableStatus == 'Deleted', ) ) .all() diff --git a/backend/dataall/tasks/bucket_policy_updater.py b/backend/dataall/tasks/bucket_policy_updater.py index 9932f53ae..6cb4c51ea 100644 --- a/backend/dataall/tasks/bucket_policy_updater.py +++ b/backend/dataall/tasks/bucket_policy_updater.py @@ -9,8 +9,8 @@ from ..aws.handlers.sts import SessionHelper from ..db import get_engine -from ..db import models, api -from dataall.modules.datasets.db.models import DatasetStorageLocation +from ..db import models +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -168,18 +168,18 @@ def get_shared_tables(self, dataset) -> typing.List[models.ShareObjectItem]: with self.engine.scoped_session() as session: tables = ( session.query( - models.DatasetTable.GlueDatabaseName.label('GlueDatabaseName'), - models.DatasetTable.GlueTableName.label('GlueTableName'), - models.DatasetTable.S3Prefix.label('S3Prefix'), - models.DatasetTable.AWSAccountId.label('SourceAwsAccountId'), - models.DatasetTable.region.label('SourceRegion'), + DatasetTable.GlueDatabaseName.label('GlueDatabaseName'), + DatasetTable.GlueTableName.label('GlueTableName'), + DatasetTable.S3Prefix.label('S3Prefix'), + DatasetTable.AWSAccountId.label('SourceAwsAccountId'), + DatasetTable.region.label('SourceRegion'), models.Environment.AwsAccountId.label('TargetAwsAccountId'), models.Environment.region.label('TargetRegion'), ) .join( models.ShareObjectItem, and_( - models.ShareObjectItem.itemUri == models.DatasetTable.tableUri + models.ShareObjectItem.itemUri == DatasetTable.tableUri ), ) .join( @@ -193,8 +193,8 @@ def get_shared_tables(self, dataset) -> typing.List[models.ShareObjectItem]: ) .filter( and_( - models.DatasetTable.datasetUri == dataset.datasetUri, - models.DatasetTable.deleted.is_(None), + DatasetTable.datasetUri == dataset.datasetUri, + DatasetTable.deleted.is_(None), models.ShareObjectItem.status == models.Enums.ShareObjectStatus.Approved.value, ) diff --git a/backend/dataall/tasks/data_sharing/share_managers/lf_share_manager.py b/backend/dataall/tasks/data_sharing/share_managers/lf_share_manager.py index b74e34e93..2b7eaf20a 
100644 --- a/backend/dataall/tasks/data_sharing/share_managers/lf_share_manager.py +++ b/backend/dataall/tasks/data_sharing/share_managers/lf_share_manager.py @@ -11,7 +11,8 @@ from ....aws.handlers.sts import SessionHelper from ....aws.handlers.ram import Ram from ....db import api, exceptions, models -from ....utils.alarm_service import AlarmService +from dataall.modules.datasets.db.models import DatasetTable +from dataall.utils.alarm_service import AlarmService logger = logging.getLogger(__name__) @@ -22,8 +23,8 @@ def __init__( session, dataset: models.Dataset, share: models.ShareObject, - shared_tables: [models.DatasetTable], - revoked_tables: [models.DatasetTable], + shared_tables: [DatasetTable], + revoked_tables: [DatasetTable], source_environment: models.Environment, target_environment: models.Environment, env_group: models.EnvironmentGroup, @@ -82,7 +83,7 @@ def build_shared_db_name(self) -> str: """ return (self.dataset.GlueDatabaseName + '_shared_' + self.share.shareUri)[:254] - def build_share_data(self, table: models.DatasetTable) -> dict: + def build_share_data(self, table: DatasetTable) -> dict: """ Build aws dict for boto3 operations on Glue and LF from share data Parameters @@ -110,7 +111,7 @@ def build_share_data(self, table: models.DatasetTable) -> dict: return data def check_share_item_exists_on_glue_catalog( - self, share_item: models.ShareObjectItem, table: models.DatasetTable + self, share_item: models.ShareObjectItem, table: DatasetTable ) -> None: """ Checks if a table in the share request @@ -271,12 +272,12 @@ def create_resource_link(cls, **data) -> dict: ) raise e - def revoke_table_resource_link_access(self, table: models.DatasetTable, principals: [str]): + def revoke_table_resource_link_access(self, table: DatasetTable, principals: [str]): """ Revokes access to glue table resource link Parameters ---------- - table : models.DatasetTable + table : DatasetTable principals: List of strings. 
IAM role arn and Quicksight groups Returns @@ -332,7 +333,7 @@ def revoke_source_table_access(self, table, principals: [str]): Revokes access to the source glue table Parameters ---------- - table : models.DatasetTable + table : DatasetTable Returns ------- @@ -366,7 +367,7 @@ def revoke_source_table_access(self, table, principals: [str]): ) return True - def delete_resource_link_table(self, table: models.DatasetTable): + def delete_resource_link_table(self, table: DatasetTable): logger.info(f'Deleting shared table {table.GlueTableName}') if not Glue.table_exists( @@ -502,7 +503,7 @@ def delete_ram_resource_shares(self, resource_arn: str) -> [dict]: def handle_share_failure( self, - table: models.DatasetTable, + table: DatasetTable, share_item: models.ShareObjectItem, error: Exception, ) -> bool: @@ -532,7 +533,7 @@ def handle_share_failure( def handle_revoke_failure( self, - table: models.DatasetTable, + table: DatasetTable, share_item: models.ShareObjectItem, error: Exception, ) -> bool: diff --git a/backend/dataall/tasks/data_sharing/share_processors/lf_process_cross_account_share.py b/backend/dataall/tasks/data_sharing/share_processors/lf_process_cross_account_share.py index ffdf7d487..dfceec978 100644 --- a/backend/dataall/tasks/data_sharing/share_processors/lf_process_cross_account_share.py +++ b/backend/dataall/tasks/data_sharing/share_processors/lf_process_cross_account_share.py @@ -4,6 +4,7 @@ from ..share_managers import LFShareManager from ....aws.handlers.ram import Ram from ....db import models, api +from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger(__name__) @@ -14,8 +15,8 @@ def __init__( session, dataset: models.Dataset, share: models.ShareObject, - shared_tables: [models.DatasetTable], - revoked_tables: [models.DatasetTable], + shared_tables: [DatasetTable], + revoked_tables: [DatasetTable], source_environment: models.Environment, target_environment: models.Environment, env_group: models.EnvironmentGroup, diff --git a/backend/dataall/tasks/data_sharing/share_processors/lf_process_same_account_share.py b/backend/dataall/tasks/data_sharing/share_processors/lf_process_same_account_share.py index 4b5ad4096..3ea939b4f 100644 --- a/backend/dataall/tasks/data_sharing/share_processors/lf_process_same_account_share.py +++ b/backend/dataall/tasks/data_sharing/share_processors/lf_process_same_account_share.py @@ -1,7 +1,8 @@ import logging from ..share_managers import LFShareManager -from ....db import models, api +from dataall.db import models, api +from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger(__name__) @@ -12,8 +13,8 @@ def __init__( session, dataset: models.Dataset, share: models.ShareObject, - shared_tables: [models.DatasetTable], - revoked_tables: [models.DatasetTable], + shared_tables: [DatasetTable], + revoked_tables: [DatasetTable], source_environment: models.Environment, target_environment: models.Environment, env_group: models.EnvironmentGroup, diff --git a/backend/dataall/utils/alarm_service.py b/backend/dataall/utils/alarm_service.py index b414e1ed0..a1d0a6d5b 100644 --- a/backend/dataall/utils/alarm_service.py +++ b/backend/dataall/utils/alarm_service.py @@ -11,6 +11,7 @@ from ..aws.handlers.sts import SessionHelper from ..db import models +from dataall.modules.datasets.db.models import DatasetTable logger = logging.getLogger(__name__) @@ -42,7 +43,7 @@ def trigger_stack_deployment_failure_alarm(self, stack: models.Stack): def trigger_table_sharing_failure_alarm( self, - table: models.DatasetTable, 
+ table: DatasetTable, share: models.ShareObject, target_environment: models.Environment, ): @@ -74,7 +75,7 @@ def trigger_table_sharing_failure_alarm( def trigger_revoke_table_sharing_failure_alarm( self, - table: models.DatasetTable, + table: DatasetTable, share: models.ShareObject, target_environment: models.Environment, ): diff --git a/backend/migrations/versions/d05f9a5b215e_backfill_dataset_table_permissions.py b/backend/migrations/versions/d05f9a5b215e_backfill_dataset_table_permissions.py index d75e7d6cc..32ca6abe0 100644 --- a/backend/migrations/versions/d05f9a5b215e_backfill_dataset_table_permissions.py +++ b/backend/migrations/versions/d05f9a5b215e_backfill_dataset_table_permissions.py @@ -6,7 +6,6 @@ """ from alembic import op -import sqlalchemy as sa from sqlalchemy import orm, Column, String, Text, DateTime, and_ from sqlalchemy.orm import query_expression from sqlalchemy.dialects import postgresql @@ -95,7 +94,7 @@ def upgrade(): resource_uri=table.tableUri, group=group, permissions=permissions.DATASET_TABLE_READ, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) print('dataset table permissions updated successfully for owners/stewards') except Exception as e: @@ -120,7 +119,7 @@ def upgrade(): group=share.principalId, permissions=permissions.DATASET_TABLE_READ, resource_uri=shared_table.itemUri, - resource_type=models.DatasetTable.__name__, + resource_type=DatasetTable.__name__, ) print('dataset table permissions updated for all shared tables') except Exception as e: diff --git a/tests/api/conftest.py b/tests/api/conftest.py index f959be417..aff658520 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -2,7 +2,7 @@ from .client import * from dataall.db import models from dataall.api import constants -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -506,7 +506,7 @@ def factory( def share_item(db): def factory( share: models.ShareObject, - table: models.DatasetTable, + table: DatasetTable, status: str ) -> models.ShareObjectItem: with db.scoped_session() as session: @@ -554,12 +554,12 @@ def factory(dataset: models.Dataset, name, username) -> DatasetStorageLocation: def table(db): cache = {} - def factory(dataset: models.Dataset, name, username) -> models.DatasetTable: + def factory(dataset: models.Dataset, name, username) -> DatasetTable: key = f'{dataset.datasetUri}-{name}' if cache.get(key): return cache.get(key) with db.scoped_session() as session: - table = models.DatasetTable( + table = DatasetTable( name=name, label=name, owner=username, diff --git a/tests/api/test_dataset.py b/tests/api/test_dataset.py index 359a780b4..9dc5d37f5 100644 --- a/tests/api/test_dataset.py +++ b/tests/api/test_dataset.py @@ -3,7 +3,7 @@ import pytest import dataall -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -227,7 +227,7 @@ def test_add_tables(table, dataset1, db): table(dataset=dataset1, name=f'table{i+1}', username=dataset1.owner) with db.scoped_session() as session: - nb = session.query(dataall.db.models.DatasetTable).count() + nb = session.query(DatasetTable).count() assert nb == 10 diff --git a/tests/api/test_dataset_profiling.py b/tests/api/test_dataset_profiling.py index 8d708e94d..8f7b1bc84 100644 --- 
a/tests/api/test_dataset_profiling.py +++ b/tests/api/test_dataset_profiling.py @@ -2,7 +2,7 @@ import pytest import dataall -from dataall.modules.datasets.db.models import DatasetProfilingRun +from dataall.modules.datasets.db.models import DatasetProfilingRun, DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -33,7 +33,7 @@ def test_add_tables(table, dataset1, db): table(dataset=dataset1, name=f'table{i+1}', username=dataset1.owner) with db.scoped_session() as session: - nb = session.query(dataall.db.models.DatasetTable).count() + nb = session.query(DatasetTable).count() assert nb == 10 @@ -141,8 +141,8 @@ def test_get_table_profiling_run( table = table(dataset=dataset1, name='table1', username=dataset1.owner) with db.scoped_session() as session: table = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.GlueTableName == 'table1') + session.query(DatasetTable) + .filter(DatasetTable.GlueTableName == 'table1') .first() ) response = client.query( @@ -178,8 +178,8 @@ def test_list_table_profiling_runs( table1000 = table(dataset=dataset1, name='table1000', username=dataset1.owner) with db.scoped_session() as session: table = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.GlueTableName == 'table1') + session.query(DatasetTable) + .filter(DatasetTable.GlueTableName == 'table1') .first() ) module_mocker.patch( diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index 88140b68c..7ed3732a4 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -4,7 +4,7 @@ import dataall from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -76,7 +76,7 @@ def test_add_tables(table, dataset1, db): table(dataset=dataset1, name=f'table{i+1}', username=dataset1.owner) with db.scoped_session() as session: - nb = session.query(dataall.db.models.DatasetTable).count() + nb = session.query(DatasetTable).count() assert nb == 10 @@ -109,8 +109,8 @@ def test_update_table(client, env1, table, dataset1, db, user, group): def test_add_columns(table, dataset1, db): with db.scoped_session() as session: table = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'table1') + session.query(DatasetTable) + .filter(DatasetTable.name == 'table1') .first() ) table_col = DatasetTableColumn( @@ -182,8 +182,8 @@ def test_list_dataset_tables(client, dataset1): def test_update_dataset_table_column(client, table, dataset1, db): with db.scoped_session() as session: table = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'table1') + session.query(DatasetTable) + .filter(DatasetTable.name == 'table1') .first() ) column = ( @@ -231,8 +231,8 @@ def test_update_dataset_table_column(client, table, dataset1, db): def test_sync_tables_and_columns(client, table, dataset1, db): with db.scoped_session() as session: table = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'table1') + session.query(DatasetTable) + .filter(DatasetTable.name == 'table1') .first() ) column = ( @@ -292,9 +292,9 @@ def test_sync_tables_and_columns(client, table, dataset1, db): ] assert DatasetTableService.sync(session, dataset1.datasetUri, glue_tables) - 
new_table: dataall.db.models.DatasetTable = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'new_table') + new_table: DatasetTable = ( + session.query(DatasetTable) + .filter(DatasetTable.name == 'new_table') .first() ) assert new_table @@ -309,9 +309,9 @@ def test_sync_tables_and_columns(client, table, dataset1, db): assert columns[0].columnType == 'column' assert columns[1].columnType == 'partition_0' - existing_table: dataall.db.models.DatasetTable = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'table1') + existing_table: DatasetTable = ( + session.query(DatasetTable) + .filter(DatasetTable.name == 'table1') .first() ) assert existing_table @@ -326,9 +326,9 @@ def test_sync_tables_and_columns(client, table, dataset1, db): assert columns[0].columnType == 'column' assert columns[1].columnType == 'partition_0' - deleted_table: dataall.db.models.DatasetTable = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.name == 'table2') + deleted_table: DatasetTable = ( + session.query(DatasetTable) + .filter(DatasetTable.name == 'table2') .first() ) assert deleted_table.LastGlueTableStatus == 'Deleted' diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index bb7f34516..1aa15ce73 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List from dataall.db import models -from dataall.modules.datasets.db.models import DatasetTableColumn +from dataall.modules.datasets.db.models import DatasetTableColumn, DatasetTable import pytest @@ -32,9 +32,9 @@ def _dataset(db, _env, _org, group, user, dataset) -> models.Dataset: @pytest.fixture(scope='module', autouse=True) -def _table(db, _dataset) -> models.DatasetTable: +def _table(db, _dataset) -> DatasetTable: with db.scoped_session() as session: - t = models.DatasetTable( + t = DatasetTable( datasetUri=_dataset.datasetUri, label='table', AWSAccountId=_dataset.AwsAccountId, diff --git a/tests/api/test_share.py b/tests/api/test_share.py index 58309aa01..d951a15f8 100644 --- a/tests/api/test_share.py +++ b/tests/api/test_share.py @@ -3,6 +3,7 @@ import pytest import dataall +from dataall.modules.datasets.db.models import DatasetTable def random_table_name(): @@ -64,7 +65,7 @@ def tables1(table: typing.Callable, dataset1: dataall.db.models.Dataset): @pytest.fixture(scope="module", autouse=True) def table1(table: typing.Callable, dataset1: dataall.db.models.Dataset, - user: dataall.db.models.User) -> dataall.db.models.DatasetTable: + user: dataall.db.models.User) -> DatasetTable: yield table( dataset=dataset1, name="table1", @@ -112,7 +113,7 @@ def tables2(table, dataset2): @pytest.fixture(scope="module", autouse=True) def table2(table: typing.Callable, dataset2: dataall.db.models.Dataset, - user2: dataall.db.models.User) -> dataall.db.models.DatasetTable: + user2: dataall.db.models.User) -> DatasetTable: yield table( dataset=dataset2, name="table2", @@ -195,7 +196,7 @@ def share1_draft( def share1_item_pa( share_item: typing.Callable, share1_draft: dataall.db.models.ShareObject, - table1: dataall.db.models.DatasetTable + table1: DatasetTable ) -> dataall.db.models.ShareObjectItem: # Cleaned up with share1_draft yield share_item( @@ -270,7 +271,7 @@ def share2_submitted( def share2_item_pa( share_item: typing.Callable, share2_submitted: dataall.db.models.ShareObject, - table1: dataall.db.models.DatasetTable + table1: 
DatasetTable ) -> dataall.db.models.ShareObjectItem: # Cleaned up with share2 yield share_item( @@ -345,7 +346,7 @@ def share3_processed( def share3_item_shared( share_item: typing.Callable, share3_processed: dataall.db.models.ShareObject, - table1: dataall.db.models.DatasetTable + table1:DatasetTable ) -> dataall.db.models.ShareObjectItem: # Cleaned up with share3 yield share_item( diff --git a/tests/cdkproxy/conftest.py b/tests/cdkproxy/conftest.py index c83d0028b..c223f4a37 100644 --- a/tests/cdkproxy/conftest.py +++ b/tests/cdkproxy/conftest.py @@ -1,6 +1,7 @@ import pytest from dataall.db import models, api +from dataall.modules.datasets.db.models import DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -121,9 +122,9 @@ def dataset(db, env: models.Environment) -> models.Dataset: @pytest.fixture(scope='module', autouse=True) -def table(db, dataset: models.Dataset) -> models.DatasetTable: +def table(db, dataset: models.Dataset) -> DatasetTable: with db.scoped_session() as session: - table = models.DatasetTable( + table = DatasetTable( label='thistable', owner='me', datasetUri=dataset.datasetUri, diff --git a/tests/searchproxy/test_indexers.py b/tests/searchproxy/test_indexers.py index fd31506f1..4fad9e6d2 100644 --- a/tests/searchproxy/test_indexers.py +++ b/tests/searchproxy/test_indexers.py @@ -6,7 +6,7 @@ from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from dataall.searchproxy import indexers -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer @@ -74,7 +74,7 @@ def dataset(org, env, db): @pytest.fixture(scope='module', autouse=True) def table(org, env, db, dataset): with db.scoped_session() as session: - table = dataall.db.models.DatasetTable( + table = DatasetTable( datasetUri=dataset.datasetUri, AWSAccountId='12345678901', S3Prefix='S3prefix', diff --git a/tests/tasks/conftest.py b/tests/tasks/conftest.py index 7e6f0d71a..267d3ef73 100644 --- a/tests/tasks/conftest.py +++ b/tests/tasks/conftest.py @@ -2,7 +2,7 @@ from dataall.db import models from dataall.api import constants -from dataall.modules.datasets.db.models import DatasetStorageLocation +from dataall.modules.datasets.db.models import DatasetStorageLocation, DatasetTable @pytest.fixture(scope="module") @@ -148,10 +148,10 @@ def factory(dataset: models.Dataset, label: str) -> DatasetStorageLocation: @pytest.fixture(scope='module') def table(db): - def factory(dataset: models.Dataset, label: str) -> models.DatasetTable: + def factory(dataset: models.Dataset, label: str) -> DatasetTable: with db.scoped_session() as session: - table = models.DatasetTable( + table = DatasetTable( name=label, label=label, owner=dataset.owner, @@ -218,7 +218,7 @@ def factory( def share_item_table(db): def factory( share: models.ShareObject, - table: models.DatasetTable, + table: DatasetTable, status: str, ) -> models.ShareObjectItem: with db.scoped_session() as session: diff --git a/tests/tasks/test_catalog_indexer.py b/tests/tasks/test_catalog_indexer.py index 31b0f14d4..8da53e3d2 100644 --- a/tests/tasks/test_catalog_indexer.py +++ b/tests/tasks/test_catalog_indexer.py @@ -1,5 +1,6 @@ import pytest import dataall +from dataall.modules.datasets.db.models import DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -65,7 +66,7 @@ def 
sync_dataset(org, env, db): @pytest.fixture(scope='module', autouse=True) def table(org, env, db, sync_dataset): with db.scoped_session() as session: - table = dataall.db.models.DatasetTable( + table = DatasetTable( datasetUri=sync_dataset.datasetUri, AWSAccountId='12345678901', S3Prefix='S3prefix', diff --git a/tests/tasks/test_lf_share_manager.py b/tests/tasks/test_lf_share_manager.py index bee190258..1ff99ba43 100644 --- a/tests/tasks/test_lf_share_manager.py +++ b/tests/tasks/test_lf_share_manager.py @@ -10,6 +10,7 @@ from dataall.db import models from dataall.api import constants +from dataall.modules.datasets.db.models import DatasetTable from dataall.tasks.data_sharing.share_processors.lf_process_cross_account_share import ProcessLFCrossAccountShare from dataall.tasks.data_sharing.share_processors.lf_process_same_account_share import ProcessLFSameAccountShare @@ -94,7 +95,7 @@ def dataset1(dataset: Callable, org1: models.Organization, source_environment: m @pytest.fixture(scope="module") -def table1(table: Callable, dataset1: models.Dataset) -> models.DatasetTable: +def table1(table: Callable, dataset1: models.Dataset) -> DatasetTable: yield table( dataset=dataset1, label="table1" @@ -102,7 +103,7 @@ def table1(table: Callable, dataset1: models.Dataset) -> models.DatasetTable: @pytest.fixture(scope="module") -def table2(table: Callable, dataset1: models.Dataset) -> models.DatasetTable: +def table2(table: Callable, dataset1: models.Dataset) -> DatasetTable: yield table( dataset=dataset1, label="table2" @@ -133,7 +134,7 @@ def share_cross_account( @pytest.fixture(scope="module") def share_item_same_account(share_item_table: Callable, share_same_account: models.ShareObject, - table1: models.DatasetTable) -> models.ShareObjectItem: + table1: DatasetTable) -> models.ShareObjectItem: yield share_item_table( share=share_same_account, table=table1, @@ -142,7 +143,7 @@ def share_item_same_account(share_item_table: Callable, share_same_account: mode @pytest.fixture(scope="module") def revoke_item_same_account(share_item_table: Callable, share_same_account: models.ShareObject, - table2: models.DatasetTable) -> models.ShareObjectItem: + table2: DatasetTable) -> models.ShareObjectItem: yield share_item_table( share=share_same_account, table=table2, @@ -151,7 +152,7 @@ def revoke_item_same_account(share_item_table: Callable, share_same_account: mod @pytest.fixture(scope="module") def share_item_cross_account(share_item_table: Callable, share_cross_account: models.ShareObject, - table1: models.DatasetTable) -> models.ShareObjectItem: + table1: DatasetTable) -> models.ShareObjectItem: yield share_item_table( share=share_cross_account, table=table1, @@ -160,7 +161,7 @@ def share_item_cross_account(share_item_table: Callable, share_cross_account: mo @pytest.fixture(scope="module") def revoke_item_cross_account(share_item_table: Callable, share_cross_account: models.ShareObject, - table2: models.DatasetTable) -> models.ShareObjectItem: + table2: DatasetTable) -> models.ShareObjectItem: yield share_item_table( share=share_cross_account, table=table2, @@ -294,7 +295,7 @@ def test_check_share_item_exists_on_glue_catalog( db, processor_same_account: ProcessLFSameAccountShare, processor_cross_account: ProcessLFCrossAccountShare, - table1: models.DatasetTable, + table1: DatasetTable, share_item_same_account: models.ShareObjectItem, share_item_cross_account: models.ShareObjectItem, mocker, @@ -332,7 +333,7 @@ def test_build_share_data( source_environment: models.Environment, target_environment: 
models.Environment, dataset1: models.Dataset, - table1: models.DatasetTable, + table1: DatasetTable, ): data_same_account = { 'source': { @@ -380,7 +381,7 @@ def test_create_resource_link( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table1: models.DatasetTable, + table1: DatasetTable, mocker, ): sts_mock = mocker.patch( @@ -463,7 +464,7 @@ def test_revoke_table_resource_link_access( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table2: models.DatasetTable, + table2: DatasetTable, mocker, ): glue_mock = mocker.patch( @@ -511,7 +512,7 @@ def test_revoke_source_table_access( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table2: models.DatasetTable, + table2: DatasetTable, mocker, ): glue_mock = mocker.patch( @@ -554,7 +555,7 @@ def test_delete_resource_link_table( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table2: models.DatasetTable, + table2: DatasetTable, mocker, ): glue_mock = mocker.patch( @@ -596,7 +597,7 @@ def test_delete_shared_database( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table1: models.DatasetTable, + table1: DatasetTable, mocker, ): glue_mock = mocker.patch( @@ -625,8 +626,8 @@ def test_revoke_external_account_access_on_source_account( source_environment: models.Environment, target_environment: models.Environment, dataset1: models.Dataset, - table1: models.DatasetTable, - table2: models.DatasetTable, + table1: DatasetTable, + table2: DatasetTable, mocker, ): lf_mock = mocker.patch( @@ -649,7 +650,7 @@ def test_handle_share_failure( processor_cross_account: ProcessLFCrossAccountShare, share_item_same_account: models.ShareObjectItem, share_item_cross_account: models.ShareObjectItem, - table1: models.DatasetTable, + table1: DatasetTable, mocker, ): @@ -678,7 +679,7 @@ def test_handle_revoke_failure( processor_cross_account: ProcessLFCrossAccountShare, revoke_item_same_account: models.ShareObjectItem, revoke_item_cross_account: models.ShareObjectItem, - table1: models.DatasetTable, + table1: DatasetTable, mocker, ): # Given diff --git a/tests/tasks/test_policies.py b/tests/tasks/test_policies.py index d51cc2ac7..ca8c259c6 100644 --- a/tests/tasks/test_policies.py +++ b/tests/tasks/test_policies.py @@ -1,4 +1,5 @@ from dataall.api.constants import OrganisationUserRole +from dataall.modules.datasets.db.models import DatasetTable from dataall.tasks.bucket_policy_updater import BucketPoliciesUpdater import pytest import dataall @@ -68,7 +69,7 @@ def sync_dataset(org, env, db): @pytest.fixture(scope='module', autouse=True) def table(org, env, db, sync_dataset): with db.scoped_session() as session: - table = dataall.db.models.DatasetTable( + table = DatasetTable( datasetUri=sync_dataset.datasetUri, AWSAccountId='12345678901', S3Prefix='S3prefix', diff --git a/tests/tasks/test_subscriptions.py b/tests/tasks/test_subscriptions.py index 874b8ccab..61c70d174 100644 --- a/tests/tasks/test_subscriptions.py +++ b/tests/tasks/test_subscriptions.py @@ -2,6 +2,7 @@ import dataall from dataall.api.constants import OrganisationUserRole +from dataall.modules.datasets.db.models import DatasetTable @pytest.fixture(scope='module') @@ -93,7 +94,7 @@ def share( ): with db.scoped_session() as session: - table = dataall.db.models.DatasetTable( + table = DatasetTable( label='foo', name='foo', 
owner='alice', diff --git a/tests/tasks/test_tables_sync.py b/tests/tasks/test_tables_sync.py index 9d8282e65..ff6f8271e 100644 --- a/tests/tasks/test_tables_sync.py +++ b/tests/tasks/test_tables_sync.py @@ -1,6 +1,7 @@ import pytest import dataall from dataall.api.constants import OrganisationUserRole +from dataall.modules.datasets.db.models import DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -76,7 +77,7 @@ def sync_dataset(org, env, db): @pytest.fixture(scope='module', autouse=True) def table(org, env, db, sync_dataset): with db.scoped_session() as session: - table = dataall.db.models.DatasetTable( + table = DatasetTable( datasetUri=sync_dataset.datasetUri, AWSAccountId='12345678901', S3Prefix='S3prefix', @@ -163,9 +164,9 @@ def test_tables_sync(db, org, env, sync_dataset, table, mocker): processed_tables = dataall.modules.datasets.tasks.tables_syncer.sync_tables(engine=db) assert len(processed_tables) == 2 with db.scoped_session() as session: - saved_table: dataall.db.models.DatasetTable = ( - session.query(dataall.db.models.DatasetTable) - .filter(dataall.db.models.DatasetTable.GlueTableName == 'table1') + saved_table: DatasetTable = ( + session.query(DatasetTable) + .filter(DatasetTable.GlueTableName == 'table1') .first() ) assert saved_table From ba45ca50bc422df343aea81749ab010b91743a71 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 21 Apr 2023 11:18:49 +0200 Subject: [PATCH 62/67] Moved delete_doc to BaseIndexer --- .../api/Objects/Dashboard/resolvers.py | 2 +- .../dataall/api/Objects/Dataset/resolvers.py | 10 ++++---- .../api/storage_location/resolvers.py | 2 +- .../modules/datasets/api/table/resolvers.py | 2 +- .../datasets/indexers/table_indexer.py | 16 +++++++++++++ backend/dataall/searchproxy/indexers.py | 24 ------------------- backend/dataall/searchproxy/upsert.py | 6 +++++ tests/api/conftest.py | 2 +- 8 files changed, 30 insertions(+), 34 deletions(-) diff --git a/backend/dataall/api/Objects/Dashboard/resolvers.py b/backend/dataall/api/Objects/Dashboard/resolvers.py index 84a2a1bcc..94372f5d1 100644 --- a/backend/dataall/api/Objects/Dashboard/resolvers.py +++ b/backend/dataall/api/Objects/Dashboard/resolvers.py @@ -311,7 +311,7 @@ def delete_dashboard(context: Context, source, dashboardUri: str = None): data=None, check_perm=True, ) - indexers.delete_doc(es=context.es, doc_id=dashboardUri) + DashboardIndexer.delete_doc(doc_id=dashboardUri) return True diff --git a/backend/dataall/api/Objects/Dataset/resolvers.py b/backend/dataall/api/Objects/Dataset/resolvers.py index 79e306c9e..c17deef76 100644 --- a/backend/dataall/api/Objects/Dataset/resolvers.py +++ b/backend/dataall/api/Objects/Dataset/resolvers.py @@ -328,9 +328,7 @@ def sync_tables(context: Context, source, datasetUri: str = None): DatasetTableIndexer.upsert_all( session=session, dataset_uri=dataset.datasetUri ) - indexers.remove_deleted_tables( - session=session, es=context.es, datasetUri=dataset.datasetUri - ) + DatasetTableIndexer.remove_all_deleted(session=session, dataset_uri=dataset.datasetUri) return Dataset.paginated_dataset_tables( session=session, username=context.username, @@ -557,13 +555,13 @@ def delete_dataset( tables = [t.tableUri for t in Dataset.get_dataset_tables(session, datasetUri)] for uri in tables: - indexers.delete_doc(es=context.es, doc_id=uri) + DatasetIndexer.delete_doc(doc_id=uri) folders = [f.locationUri for f in DatasetLocationService.get_dataset_folders(session, datasetUri)] for uri in folders: - indexers.delete_doc(es=context.es, doc_id=uri) + 
DatasetIndexer.delete_doc(doc_id=uri) - indexers.delete_doc(es=context.es, doc_id=datasetUri) + DatasetIndexer.delete_doc(doc_id=datasetUri) Dataset.delete_dataset( session=session, diff --git a/backend/dataall/modules/datasets/api/storage_location/resolvers.py b/backend/dataall/modules/datasets/api/storage_location/resolvers.py index 09cf4b14a..6f8d82e43 100644 --- a/backend/dataall/modules/datasets/api/storage_location/resolvers.py +++ b/backend/dataall/modules/datasets/api/storage_location/resolvers.py @@ -88,7 +88,7 @@ def remove_storage_location(context, source, locationUri: str = None): data={'locationUri': location.locationUri}, check_perm=True, ) - indexers.delete_doc(es=context.es, doc_id=location.locationUri) + DatasetLocationIndexer.delete_doc(doc_id=location.locationUri) return True diff --git a/backend/dataall/modules/datasets/api/table/resolvers.py b/backend/dataall/modules/datasets/api/table/resolvers.py index ea16cae79..ce884bcbe 100644 --- a/backend/dataall/modules/datasets/api/table/resolvers.py +++ b/backend/dataall/modules/datasets/api/table/resolvers.py @@ -99,7 +99,7 @@ def delete_table(context, source, tableUri: str = None): }, check_perm=True, ) - indexers.delete_doc(es=context.es, doc_id=tableUri) + DatasetTableIndexer.delete_doc(doc_id=tableUri) return True diff --git a/backend/dataall/modules/datasets/indexers/table_indexer.py b/backend/dataall/modules/datasets/indexers/table_indexer.py index 4c96eea6d..2fe9451e1 100644 --- a/backend/dataall/modules/datasets/indexers/table_indexer.py +++ b/backend/dataall/modules/datasets/indexers/table_indexer.py @@ -97,3 +97,19 @@ def upsert_all(cls, session, dataset_uri: str): for table in tables: DatasetTableIndexer.upsert(session=session, table_uri=table.tableUri) return tables + + @classmethod + def remove_all_deleted(cls, session, dataset_uri: str): + tables = ( + session.query(DatasetTable) + .filter( + and_( + DatasetTable.datasetUri == dataset_uri, + DatasetTable.LastGlueTableStatus == 'Deleted', + ) + ) + .all() + ) + for table in tables: + cls.delete_doc(doc_id=table.tableUri) + return tables diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 9140cf3aa..eba878fa2 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -1,11 +1,8 @@ import logging -from sqlalchemy import and_ - from .. 
import db from ..db import models from dataall.searchproxy.upsert import BaseIndexer -from dataall.modules.datasets.db.models import DatasetTable log = logging.getLogger(__name__) @@ -71,24 +68,3 @@ def upsert(cls, session, dashboard_uri: str): }, ) return dashboard - - -def remove_deleted_tables(session, es, datasetUri: str): - tables = ( - session.query(DatasetTable) - .filter( - and_( - DatasetTable.datasetUri == datasetUri, - DatasetTable.LastGlueTableStatus == 'Deleted', - ) - ) - .all() - ) - for table in tables: - delete_doc(es, doc_id=table.tableUri) - return tables - - -def delete_doc(es, doc_id, index='dataall-index'): - es.delete(index=index, id=doc_id, ignore=[400, 404]) - return True diff --git a/backend/dataall/searchproxy/upsert.py b/backend/dataall/searchproxy/upsert.py index 9eb2e3125..a787032dd 100644 --- a/backend/dataall/searchproxy/upsert.py +++ b/backend/dataall/searchproxy/upsert.py @@ -30,6 +30,12 @@ def es(cls): def upsert(session, target_id): raise NotImplementedError("Method upsert is not implemented") + @classmethod + def delete_doc(cls, doc_id): + es = cls.es() + es.delete(index=cls._INDEX, id=doc_id, ignore=[400, 404]) + return True + @classmethod def _index(cls, doc_id, doc): es = cls.es() diff --git a/tests/api/conftest.py b/tests/api/conftest.py index aff658520..37fef4f10 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -40,7 +40,7 @@ def patch_es(module_mocker): return_value={} ) module_mocker.patch('dataall.searchproxy.indexers.DashboardIndexer.upsert', return_value={}) - module_mocker.patch('dataall.searchproxy.indexers.delete_doc', return_value={}) + module_mocker.patch('dataall.searchproxy.upsert.BaseIndexer.delete_doc', return_value={}) @pytest.fixture(scope='module', autouse=True) From dc8ff72a817e4403cd3cb1481908228a7709efaa Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Fri, 21 Apr 2023 11:28:38 +0200 Subject: [PATCH 63/67] Lazy creation of connection to OpenSearch --- backend/api_handler.py | 6 +----- .../dataall/api/Objects/Glossary/registry.py | 8 +++----- .../dataall/api/Objects/Glossary/resolvers.py | 2 +- backend/dataall/api/Objects/Vote/resolvers.py | 4 ++-- backend/dataall/api/context.py | 2 -- backend/dataall/core/context.py | 2 -- .../modules/datasets/tasks/tables_syncer.py | 10 +++------- backend/dataall/tasks/catalog_indexer.py | 17 ++++------------- backend/local_graphql_server.py | 3 +-- 9 files changed, 15 insertions(+), 39 deletions(-) diff --git a/backend/api_handler.py b/backend/api_handler.py index 890235347..714e107b2 100644 --- a/backend/api_handler.py +++ b/backend/api_handler.py @@ -15,7 +15,6 @@ from dataall.core.context import set_context, dispose_context, RequestContext from dataall.db import init_permissions, get_engine, api, permissions from dataall.modules.loader import load_modules, ImportMode -from dataall.searchproxy import connect logger = logging.getLogger() logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO')) @@ -30,7 +29,6 @@ TYPE_DEFS = gql(SCHEMA.gql(with_directives=False)) ENVNAME = os.getenv('envname', 'local') ENGINE = get_engine(envname=ENVNAME) -ES = connect(envname=ENVNAME) Worker.queue = SqsQueue.send init_permissions(ENGINE) @@ -99,7 +97,6 @@ def handler(event, context): log.info('Lambda Event %s', event) log.debug('Env name %s', ENVNAME) - log.debug('ElasticSearch %s', ES) log.debug('Engine %s', ENGINE.engine.url) if event['httpMethod'] == 'OPTIONS': @@ -137,11 +134,10 @@ def handler(event, context): print(f'Error managing groups due to: {e}') groups = [] - 
set_context(RequestContext(ENGINE, username, groups, ES)) + set_context(RequestContext(ENGINE, username, groups)) app_context = { 'engine': ENGINE, - 'es': ES, 'username': username, 'groups': groups, 'schema': SCHEMA, diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 36fea6cf0..ef006a777 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -1,7 +1,5 @@ -from dataclasses import dataclass, field -from typing import Type, Dict, Optional, Protocol, Union, Callable, Any - -from opensearchpy import OpenSearch +from dataclasses import dataclass +from typing import Type, Dict, Optional, Protocol, Union from dataall.api import gql from dataall.api.gql.graphql_union_type import UnionTypeRegistry @@ -56,7 +54,7 @@ def types(cls): return [gql.Ref(definition.object_type) for definition in cls._DEFINITIONS.values()] @classmethod - def reindex(cls, session, es: OpenSearch, target_type: str, target_uri: str): + def reindex(cls, session, target_type: str, target_uri: str): definition = cls._DEFINITIONS[target_type] if definition.reindexer: definition.reindexer.upsert(session, target_uri) diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index fdc4c3eea..42fae88ce 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -458,7 +458,7 @@ def reindex(context, linkUri): if not link: return - GlossaryRegistry.reindex(session, context.es, link.targetType, link.targetUri) + GlossaryRegistry.reindex(session, link.targetType, link.targetUri) def _target_model(target_type: str): diff --git a/backend/dataall/api/Objects/Vote/resolvers.py b/backend/dataall/api/Objects/Vote/resolvers.py index 42f5c20f5..d9f739872 100644 --- a/backend/dataall/api/Objects/Vote/resolvers.py +++ b/backend/dataall/api/Objects/Vote/resolvers.py @@ -28,11 +28,11 @@ def upvote(context: Context, source, input=None): data=input, check_perm=True, ) - reindex(session, context.es, vote) + reindex(session, vote) return vote -def reindex(session, es, vote): +def reindex(session, vote): if vote.targetType == 'dataset': DatasetIndexer.upsert(session=session, dataset_uri=vote.targetUri) elif vote.targetType == 'dashboard': diff --git a/backend/dataall/api/context.py b/backend/dataall/api/context.py index a210dc0a1..238627a81 100644 --- a/backend/dataall/api/context.py +++ b/backend/dataall/api/context.py @@ -2,11 +2,9 @@ class Context: def __init__( self, engine=None, - es=None, username=None, groups=None, ): self.engine = engine - self.es = es self.username = username self.groups = groups diff --git a/backend/dataall/core/context.py b/backend/dataall/core/context.py index dcf594896..a6cc2d4ba 100644 --- a/backend/dataall/core/context.py +++ b/backend/dataall/core/context.py @@ -12,7 +12,6 @@ from dataall.db.connection import Engine from threading import local -import opensearchpy _request_storage = local() @@ -24,7 +23,6 @@ class RequestContext: db_engine: Engine username: str groups: List[str] - es_engine: opensearchpy.OpenSearch def get_context() -> RequestContext: diff --git a/backend/dataall/modules/datasets/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py index 0974df585..4ed22425e 100644 --- a/backend/dataall/modules/datasets/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -10,8 +10,6 @@ from dataall.db import 
models from dataall.modules.datasets.db.models import DatasetTable from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer -from dataall.searchproxy import indexers -from dataall.searchproxy.connect import connect from dataall.utils.alarm_service import AlarmService from dataall.modules.datasets.services.dataset_table import DatasetTableService @@ -22,7 +20,7 @@ log = logging.getLogger(__name__) -def sync_tables(engine, es=None): +def sync_tables(engine): with engine.scoped_session() as session: processed_tables = [] all_datasets: [models.Dataset] = db.api.Dataset.list_all_active_datasets( @@ -88,8 +86,7 @@ def sync_tables(engine, es=None): processed_tables.extend(tables) - if es: - DatasetTableIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) + DatasetTableIndexer.upsert_all(session, dataset_uri=dataset.datasetUri) except Exception as e: log.error( f'Failed to sync tables for dataset ' @@ -113,5 +110,4 @@ def is_assumable_pivot_role(env: models.Environment): if __name__ == '__main__': ENVNAME = os.environ.get('envname', 'local') ENGINE = get_engine(envname=ENVNAME) - ES = connect(envname=ENVNAME) - sync_tables(engine=ENGINE, es=ES) + sync_tables(engine=ENGINE) diff --git a/backend/dataall/tasks/catalog_indexer.py b/backend/dataall/tasks/catalog_indexer.py index 5d32800c7..945bdd214 100644 --- a/backend/dataall/tasks/catalog_indexer.py +++ b/backend/dataall/tasks/catalog_indexer.py @@ -5,13 +5,9 @@ from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer from .. import db -from ..db import get_engine, exceptions -from ..db import models +from dataall.db import get_engine, models from dataall.searchproxy.indexers import DashboardIndexer -from ..searchproxy.connect import ( - connect, -) -from ..utils.alarm_service import AlarmService +from dataall.utils.alarm_service import AlarmService root = logging.getLogger() root.setLevel(logging.INFO) @@ -20,12 +16,8 @@ log = logging.getLogger(__name__) -def index_objects(engine, es): +def index_objects(engine): try: - if not es: - raise exceptions.AWSResourceNotFound( - action='CATALOG_INDEXER_TASK', message='ES configuration not found' - ) indexed_objects_counter = 0 with engine.scoped_session() as session: @@ -58,5 +50,4 @@ def index_objects(engine, es): if __name__ == '__main__': ENVNAME = os.environ.get('envname', 'local') ENGINE = get_engine(envname=ENVNAME) - ES = connect(envname=ENVNAME) - index_objects(engine=ENGINE, es=ES) + index_objects(engine=ENGINE) diff --git a/backend/local_graphql_server.py b/backend/local_graphql_server.py index 44f79a087..98e99cd73 100644 --- a/backend/local_graphql_server.py +++ b/backend/local_graphql_server.py @@ -86,12 +86,11 @@ def request_context(headers, mock=False): tenant_name='dataall', ) - set_context(RequestContext(engine, username, groups, es)) + set_context(RequestContext(engine, username, groups)) # TODO: remove when the migration to a new RequestContext API is complete. 
Used only for backward compatibility context = Context( engine=engine, - es=es, schema=schema, username=username, groups=groups, From f382a6897b8cb2b366c1d098dcf71d2dd2ea0d72 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 4 May 2023 10:58:15 +0200 Subject: [PATCH 64/67] Review remarks --- .../dataall/api/Objects/Glossary/registry.py | 2 +- .../datasets/aws/s3_location_client.py | 31 +++++++++++++++++++ .../datasets/handlers/s3_location_handler.py | 30 ++---------------- .../datasets/indexers/dataset_indexer.py | 2 +- .../datasets/indexers/location_indexer.py | 2 +- .../datasets/indexers/table_indexer.py | 2 +- backend/dataall/searchproxy/__init__.py | 1 - .../{upsert.py => base_indexer.py} | 7 +++-- backend/dataall/searchproxy/indexers.py | 3 +- .../share_managers/s3_share_manager.py | 14 +++------ .../share_processors/s3_process_share.py | 4 +-- 11 files changed, 51 insertions(+), 47 deletions(-) create mode 100644 backend/dataall/modules/datasets/aws/s3_location_client.py rename backend/dataall/searchproxy/{upsert.py => base_indexer.py} (88%) diff --git a/backend/dataall/api/Objects/Glossary/registry.py b/backend/dataall/api/Objects/Glossary/registry.py index 36fea6cf0..fb7e6edf7 100644 --- a/backend/dataall/api/Objects/Glossary/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -7,7 +7,7 @@ from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models from dataall.searchproxy.indexers import DashboardIndexer -from dataall.searchproxy.upsert import BaseIndexer +from dataall.searchproxy.base_indexer import BaseIndexer class Identifiable(Protocol): diff --git a/backend/dataall/modules/datasets/aws/s3_location_client.py b/backend/dataall/modules/datasets/aws/s3_location_client.py new file mode 100644 index 000000000..45385743d --- /dev/null +++ b/backend/dataall/modules/datasets/aws/s3_location_client.py @@ -0,0 +1,31 @@ +import logging + +from dataall.aws.handlers.sts import SessionHelper +from dataall.modules.datasets.db.models import DatasetStorageLocation + +log = logging.getLogger(__name__) + + +class S3LocationClient: + + def __init__(self, location: DatasetStorageLocation): + session = SessionHelper.remote_session(accountid=location.AWSAccountId) + self._client = session.client('s3', region_name=location.region) + self._location = location + + def create_bucket_prefix(self): + location = self._location + try: + response = self._client.put_object( + Bucket=location.S3BucketName, Body='', Key=location.S3Prefix + '/' + ) + log.info( + 'Creating S3 Prefix `{}`({}) on AWS #{}'.format( + location.S3BucketName, location.AWSAccountId, response + ) + ) + except Exception as e: + log.error( + f'Dataset storage location creation failed on S3 for dataset location {location.locationUri} : {e}' + ) + raise e diff --git a/backend/dataall/modules/datasets/handlers/s3_location_handler.py b/backend/dataall/modules/datasets/handlers/s3_location_handler.py index ba8cf6eda..296b7e33c 100644 --- a/backend/dataall/modules/datasets/handlers/s3_location_handler.py +++ b/backend/dataall/modules/datasets/handlers/s3_location_handler.py @@ -3,6 +3,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.aws.handlers.sts import SessionHelper from dataall.db import models +from dataall.modules.datasets.aws.s3_location_client import S3LocationClient from dataall.modules.datasets.services.dataset_location import DatasetLocationService log = logging.getLogger(__name__) @@ -11,11 +12,6 @@ class S3DatasetLocationHandler: 
"""Handles async requests related to s3 for dataset storage location""" - @staticmethod - def client(account_id: str, region: str, client_type: str): - session = SessionHelper.remote_session(accountid=account_id) - return session.client(client_type, region_name=region) - @staticmethod @Worker.handler(path='s3.prefix.create') def create_dataset_location(engine, task: models.Task): @@ -23,26 +19,6 @@ def create_dataset_location(engine, task: models.Task): location = DatasetLocationService.get_location_by_uri( session, task.targetUri ) - S3DatasetLocationHandler.create_bucket_prefix(location) - return location - - @staticmethod - def create_bucket_prefix(location): - try: - account_id = location.AWSAccountId - region = location.region - s3cli = S3DatasetLocationHandler.client(account_id=account_id, region=region, client_type='s3') - response = s3cli.put_object( - Bucket=location.S3BucketName, Body='', Key=location.S3Prefix + '/' - ) - log.info( - 'Creating S3 Prefix `{}`({}) on AWS #{}'.format( - location.S3BucketName, account_id, response - ) - ) + S3LocationClient(location).create_bucket_prefix() location.locationCreated = True - except Exception as e: - log.error( - f'Dataset storage location creation failed on S3 for dataset location {location.locationUri} : {e}' - ) - raise e + return location diff --git a/backend/dataall/modules/datasets/indexers/dataset_indexer.py b/backend/dataall/modules/datasets/indexers/dataset_indexer.py index 8cb0b7873..35de32e1c 100644 --- a/backend/dataall/modules/datasets/indexers/dataset_indexer.py +++ b/backend/dataall/modules/datasets/indexers/dataset_indexer.py @@ -3,7 +3,7 @@ from dataall import db from dataall.db import models from dataall.modules.datasets.services.dataset_location import DatasetLocationService -from dataall.searchproxy.upsert import BaseIndexer +from dataall.searchproxy.base_indexer import BaseIndexer class DatasetIndexer(BaseIndexer): diff --git a/backend/dataall/modules/datasets/indexers/location_indexer.py b/backend/dataall/modules/datasets/indexers/location_indexer.py index 72495b51c..f649a244b 100644 --- a/backend/dataall/modules/datasets/indexers/location_indexer.py +++ b/backend/dataall/modules/datasets/indexers/location_indexer.py @@ -3,7 +3,7 @@ from dataall.db import models from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer -from dataall.searchproxy.upsert import BaseIndexer +from dataall.searchproxy.base_indexer import BaseIndexer class DatasetLocationIndexer(BaseIndexer): diff --git a/backend/dataall/modules/datasets/indexers/table_indexer.py b/backend/dataall/modules/datasets/indexers/table_indexer.py index 1eab70a87..fec9e4f7c 100644 --- a/backend/dataall/modules/datasets/indexers/table_indexer.py +++ b/backend/dataall/modules/datasets/indexers/table_indexer.py @@ -3,7 +3,7 @@ from dataall.db import models from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer -from dataall.searchproxy.upsert import BaseIndexer +from dataall.searchproxy.base_indexer import BaseIndexer class DatasetTableIndexer(BaseIndexer): diff --git a/backend/dataall/searchproxy/__init__.py b/backend/dataall/searchproxy/__init__.py index 78493adb6..8dab74fea 100644 --- a/backend/dataall/searchproxy/__init__.py +++ b/backend/dataall/searchproxy/__init__.py @@ -4,5 +4,4 @@ __all__ = [ 'connect', 'run_query', - 'upsert', ] diff --git a/backend/dataall/searchproxy/upsert.py b/backend/dataall/searchproxy/base_indexer.py similarity index 88% rename from backend/dataall/searchproxy/upsert.py rename to 
backend/dataall/searchproxy/base_indexer.py index 9eb2e3125..fd0cb5e0e 100644 --- a/backend/dataall/searchproxy/upsert.py +++ b/backend/dataall/searchproxy/base_indexer.py @@ -21,7 +21,10 @@ class BaseIndexer(ABC): def es(cls): """Lazy creation of the OpenSearch connection""" if cls._es is None: - cls._es = connect(envname=os.getenv('envname', 'local')) + es = connect(envname=os.getenv('envname', 'local')) + if not es: + raise Exception('Failed to create ES connection') + cls._es = es return cls._es @@ -35,7 +38,7 @@ def _index(cls, doc_id, doc): es = cls.es() doc['_indexed'] = datetime.now() if es: - res = es.index(index=BaseIndexer._INDEX, id=doc_id, body=doc) + res = es.index(index=cls._INDEX, id=doc_id, body=doc) log.info(f'doc {doc} for id {doc_id} indexed with response {res}') return True else: diff --git a/backend/dataall/searchproxy/indexers.py b/backend/dataall/searchproxy/indexers.py index 13ba44eea..ce4145fc5 100644 --- a/backend/dataall/searchproxy/indexers.py +++ b/backend/dataall/searchproxy/indexers.py @@ -4,11 +4,12 @@ from .. import db from ..db import models -from dataall.searchproxy.upsert import BaseIndexer +from dataall.searchproxy.base_indexer import BaseIndexer log = logging.getLogger(__name__) +# TODO Should be moved to dashboard module class DashboardIndexer(BaseIndexer): @classmethod def upsert(cls, session, dashboard_uri: str): diff --git a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py index 30c72a60e..fad1e801f 100644 --- a/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py +++ b/backend/dataall/tasks/data_sharing/share_managers/s3_share_manager.py @@ -398,12 +398,9 @@ def delete_dataset_bucket_key_policy( json.dumps(policy) ) - def handle_share_failure(self, error: Exception) -> None: + def log_share_failure(self, error: Exception) -> None: """ - Handles share failure by raising an alarm to alarmsTopic - Returns - ------- - True if alarm published successfully + Writes a log if the failure happened while sharing """ logger.error( f'Failed to share folder {self.s3_prefix} ' @@ -412,12 +409,9 @@ def handle_share_failure(self, error: Exception) -> None: f'due to: {error}' ) - def handle_revoke_failure(self, error: Exception) -> None: + def log_revoke_failure(self, error: Exception) -> None: """ - Handles share failure by raising an alarm to alarmsTopic - Returns - ------- - True if alarm published successfully + Writes a log if the failure happened while revoking share """ logger.error( f'Failed to revoke S3 permissions to folder {self.s3_prefix} ' diff --git a/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py b/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py index 96b608338..860aa8a69 100644 --- a/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py +++ b/backend/dataall/tasks/data_sharing/share_processors/s3_process_share.py @@ -92,7 +92,7 @@ def process_approved_shares( shared_item_SM.update_state_single_item(session, sharing_item, new_state) except Exception as e: - sharing_folder.handle_share_failure(e) + sharing_folder.log_share_failure(e) new_state = shared_item_SM.run_transition(models.Enums.ShareItemActions.Failure.value) shared_item_SM.update_state_single_item(session, sharing_item, new_state) success = False @@ -155,7 +155,7 @@ def process_revoked_shares( revoked_item_SM.update_state_single_item(session, removing_item, new_state) except Exception as e: - 
removing_folder.handle_revoke_failure(e) + removing_folder.log_revoke_failure(e) new_state = revoked_item_SM.run_transition(models.Enums.ShareItemActions.Failure.value) revoked_item_SM.update_state_single_item(session, removing_item, new_state) success = False From 532ff0d678c1d6b842a3ad1e39e03321bcef9b5d Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 4 May 2023 11:56:36 +0200 Subject: [PATCH 65/67] Added TODO --- .../dataall/modules/datasets/handlers/glue_profiling_handler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py b/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py index d15607733..be0915331 100644 --- a/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_profiling_handler.py @@ -68,6 +68,7 @@ def start_profiling_run(engine, task: models.Task): ) return run + # TODO move to client once dataset is migrated @staticmethod def get_job_run(**data): accountid = data['accountid'] From 72864d6ef574a86d88d45158a13b57fa76d4a7c1 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 4 May 2023 13:56:10 +0200 Subject: [PATCH 66/67] Fixed imports --- .../dataall/modules/datasets/handlers/glue_column_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index c8a5cb848..6882d4e12 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -18,7 +18,7 @@ class DatasetColumnGlueHandler: @Worker.handler('glue.table.columns') def get_table_columns(engine, task: models.Task): with engine.scoped_session() as session: - dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( + dataset_table: DatasetTable = session.query(DatasetTable).get( task.targetUri ) aws = SessionHelper.remote_session(dataset_table.AWSAccountId) @@ -34,7 +34,7 @@ def get_table_columns(engine, task: models.Task): def update_table_columns(engine, task: models.Task): with engine.scoped_session() as session: column: DatasetTableColumn = session.query(DatasetTableColumn).get(task.targetUri) - table: DatasetTable = session.query(models.DatasetTable).get(column.tableUri) + table: DatasetTable = session.query(DatasetTable).get(column.tableUri) aws_session = SessionHelper.remote_session(table.AWSAccountId) From 07e6975ba3cda0740a3a4b110684e92d0bc8fce2 Mon Sep 17 00:00:00 2001 From: nikpodsh <124577300+nikpodsh@users.noreply.github.com> Date: Thu, 4 May 2023 14:16:27 +0200 Subject: [PATCH 67/67] Update tests/api/conftest.py Co-authored-by: dbalintx <132444646+dbalintx@users.noreply.github.com> --- tests/api/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/api/conftest.py b/tests/api/conftest.py index 37fef4f10..52f445502 100644 --- a/tests/api/conftest.py +++ b/tests/api/conftest.py @@ -40,7 +40,7 @@ def patch_es(module_mocker): return_value={} ) module_mocker.patch('dataall.searchproxy.indexers.DashboardIndexer.upsert', return_value={}) - module_mocker.patch('dataall.searchproxy.upsert.BaseIndexer.delete_doc', return_value={}) + module_mocker.patch('dataall.searchproxy.base_indexer.BaseIndexer.delete_doc', return_value={}) @pytest.fixture(scope='module', autouse=True)