Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generic dataset module and specific s3_datasets module - part 2 (Create datasets_base and move enums) #1257

Merged
merged 11 commits into from
May 15, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)

from dataall.modules.s3_datasets.db.dataset_models import Dataset
from dataall.modules.s3_datasets.services.datasets_enums import DatasetRole
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
from dataall.modules.s3_datasets.services.dataset_service import DatasetServiceInterface


Expand Down
18 changes: 18 additions & 0 deletions backend/dataall/modules/datasets_base/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from typing import Set
from dataall.base.loader import ModuleInterface, ImportMode


class DatasetBaseModuleInterface(ModuleInterface):
    """Module interface for the shared datasets_base module.

    Loaded as a dependency of the concrete dataset modules (e.g. s3_datasets)
    so that the shared dataset enums are imported in every relevant runtime mode.
    """

    @staticmethod
    def is_supported(modes: Set[ImportMode]) -> bool:
        """Return True if any of the requested import modes is supported by this module."""
        supported_modes = {
            ImportMode.API,
            ImportMode.CDK,
            ImportMode.HANDLERS,
            ImportMode.STACK_UPDATER_TASK,
            ImportMode.CATALOG_INDEXER_TASK,
        }
        # Wrap in bool(): `modes & supported_modes` is a set intersection, so the
        # original returned a (truthy) set while the annotation promises a bool.
        return bool(modes & supported_modes)

    def __init__(self):
        # Imported for its side effects: brings the shared dataset enums into scope.
        import dataall.modules.datasets_base.services.datasets_enums
Empty file.
Empty file.
Empty file.
24 changes: 22 additions & 2 deletions backend/dataall/modules/s3_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ def is_supported(modes):

@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.datasets_base import DatasetBaseModuleInterface
from dataall.modules.catalog import CatalogApiModuleInterface
from dataall.modules.feed import FeedApiModuleInterface
from dataall.modules.vote import VoteApiModuleInterface

return [
DatasetBaseModuleInterface,
CatalogApiModuleInterface,
FeedApiModuleInterface,
VoteApiModuleInterface,
Expand Down Expand Up @@ -85,12 +87,17 @@ class DatasetAsyncHandlersModuleInterface(ModuleInterface):
def is_supported(modes: Set[ImportMode]):
return ImportMode.HANDLERS in modes

@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.datasets_base import DatasetBaseModuleInterface

return [DatasetBaseModuleInterface]

def __init__(self):
import dataall.modules.s3_datasets.handlers
import dataall.modules.s3_datasets.db.dataset_models
import dataall.modules.s3_datasets.db.dataset_repositories
import dataall.modules.s3_datasets.services.dataset_permissions
import dataall.modules.s3_datasets.services.datasets_enums

log.info('Dataset handlers have been imported')

Expand All @@ -102,6 +109,12 @@ class DatasetCdkModuleInterface(ModuleInterface):
def is_supported(modes: Set[ImportMode]):
return ImportMode.CDK in modes

@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.datasets_base import DatasetBaseModuleInterface

return [DatasetBaseModuleInterface]

def __init__(self):
import dataall.modules.s3_datasets.cdk
from dataall.core.environment.cdk.environment_stack import EnvironmentSetup
Expand All @@ -119,6 +132,12 @@ class DatasetStackUpdaterModuleInterface(ModuleInterface):
def is_supported(modes: Set[ImportMode]) -> bool:
return ImportMode.STACK_UPDATER_TASK in modes

@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.datasets_base import DatasetBaseModuleInterface

return [DatasetBaseModuleInterface]

def __init__(self):
from dataall.modules.s3_datasets.tasks.dataset_stack_finder import DatasetStackFinder

Expand All @@ -134,8 +153,9 @@ def is_supported(modes: Set[ImportMode]) -> bool:
@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.catalog import CatalogIndexerModuleInterface
from dataall.modules.datasets_base import DatasetBaseModuleInterface

return [CatalogIndexerModuleInterface]
return [CatalogIndexerModuleInterface, DatasetBaseModuleInterface]

def __init__(self):
from dataall.modules.s3_datasets.indexers.dataset_catalog_indexer import DatasetCatalogIndexer
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataall.base.api import gql
from dataall.base.api.constants import SortDirection
from dataall.modules.s3_datasets.services.datasets_enums import DatasetSortField
from dataall.modules.datasets_base.services.datasets_enums import DatasetSortField


NewDatasetInput = gql.InputType(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dataall.core.organizations.db.organization_repositories import OrganizationRepository
from dataall.base.db.exceptions import RequiredParameter, InvalidInput
from dataall.modules.s3_datasets.db.dataset_models import Dataset
from dataall.modules.s3_datasets.services.datasets_enums import DatasetRole, ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole, ConfidentialityClassification
from dataall.modules.s3_datasets.services.dataset_service import DatasetService

log = logging.getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion backend/dataall/modules/s3_datasets/api/dataset/types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataall.base.api import gql
from dataall.modules.s3_datasets.services.datasets_enums import DatasetRole
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
from dataall.modules.s3_datasets.api.dataset.resolvers import (
get_dataset_environment,
get_dataset_organization,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataall.base.api import gql
from dataall.base.api.constants import SortDirection
from dataall.modules.s3_datasets.services.datasets_enums import DatasetSortField
from dataall.modules.datasets_base.services.datasets_enums import DatasetSortField


ModifyDatasetTableInput = gql.InputType(
Expand Down
2 changes: 1 addition & 1 deletion backend/dataall/modules/s3_datasets/db/dataset_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from sqlalchemy.dialects.postgresql import JSON, ARRAY
from sqlalchemy.orm import query_expression
from dataall.base.db import Base, Resource, utils
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language


class DatasetTableColumn(Resource, Base):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dataall.core.organizations.db.organization_repositories import OrganizationRepository
from dataall.base.db import paginate
from dataall.base.db.exceptions import ObjectNotFound
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language
from dataall.core.environment.services.environment_resource_manager import EnvironmentResource
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, Dataset, DatasetLock
from dataall.base.utils.naming_convention import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dataall.modules.s3_datasets.services.dataset_permissions import UPDATE_DATASET_TABLE
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, DatasetTableColumn
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification
from dataall.modules.s3_datasets.services.dataset_permissions import PREVIEW_DATASET_TABLE


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from dataall.modules.s3_datasets.db.dataset_table_repositories import DatasetTableRepository
from dataall.modules.s3_datasets.services.dataset_permissions import PROFILE_DATASET_TABLE, GET_DATASET
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification
from dataall.modules.s3_datasets.db.dataset_models import DatasetProfilingRun, DatasetTable
from dataall.modules.s3_datasets.services.dataset_permissions import PREVIEW_DATASET_TABLE

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
IMPORT_DATASET,
)
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.services.datasets_enums import DatasetRole
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
from dataall.modules.s3_datasets.db.dataset_models import Dataset, DatasetTable
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_TABLE_READ

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
SYNC_DATASET,
)
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, Dataset
from dataall.modules.s3_datasets.services.dataset_permissions import (
PREVIEW_DATASET_TABLE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sqlalchemy.ext.declarative import declarative_base

from dataall.base.db import utils, Resource
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language


revision = '5e5c84138af7'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import ShareObjectStatus
from datetime import datetime

from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language


# revision identifiers, used by Alembic.
Expand Down
2 changes: 1 addition & 1 deletion backend/migrations/versions/97050ec09354_release_3_7_8.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sqlalchemy.ext.declarative import declarative_base

from dataall.base.db import utils, Resource
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language

# revision identifiers, used by Alembic.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
NamingConventionService,
NamingConventionPattern,
)
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification, Language
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification, Language

# revision identifiers, used by Alembic.
revision = 'e1cd4927482b'
Expand Down
3 changes: 3 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
"datapipelines": {
"active": true
},
"datasets_base": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ideally we would have dataset_base be automatically activated if any of the child dataset modules (i.e. s3_datasets) is active and not have to expose it here on the config.json

Configuring the dataset_base module as active or not does not really mean anything as it is really the child dataset modules that are meaningful to activate or not

I am not sure how easy it is to do such a thing or if it requires a big shift in the loader logic but if an easy fix

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I saw your comment that it is required here because of how we load ImportModes of CDK and CDK_CLI_EXTENSION - but can you explain why that is / is there an easy way to resolve?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First of all, datasets_base will be added in the config.json at some point (check the full design steps in #1123) because some parameters are relevant for all datasets, not only for s3_datasets or redshift_datasets (e.g. confidentiality). That is why I did not give it more thought.

But if you are curious, the issue is a bit hidden, the loader has some methods to check that it initialized the modules correctly. With the current implementation I think that a module cannot have a module interface CDK and not have a CDK_CLI_EXTENSION if it is not a config.json module. I think the problem is not that much on the loader but on the way we are using it in the cdk proxy. I will have a closer look

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In earlier versions such as v2.4.0 I think we never exposed datasets_base as a module in config.json, and we had the same type of __init__.py function where it was only supported by CDK and not CDK_CLI_EXTENSION

Trying to understand why we could do that then but cannot now - playing around a bit with how these modules are imported using the loader class

Either way I tested these changes and it looks good so will approve PR but do some more digging here on the side

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I found the issue as to why there is an error when we remove datasets_base from config.json:

  • When we load_modules(modes={ImportMode.CDK_CLI_EXTENSION})
    • We first import all modules in_config as part of def _load_modules(...)
    • This would theoretically include s3_datasets and dataset_sharing at this point in time
    • When we import dataset_sharing we have the following line in top level of __init__.py that subsequently imports s3_datasets and datasets_base modules
from dataall.modules.dataset_sharing.db.share_object_repositories import ShareEnvironmentResource
  • Then when we _check_loading_correct(...) in loader.py
    • it fails on L252 with ImportError since datasets_base is in sys.modules.keys() but is not in checked_modules_names (since datasets_base is not in config / expected_load)

Previously we never exposed datasets_base or datasets_sharing as configurable modules in config.json so we avoided this problem. Moving the line

from dataall.modules.dataset_sharing.db.share_object_repositories import ShareEnvironmentResource

in backend/dataall/modules/dataset_sharing/__init__.py to within the def __init__(self): in class SharingApiModuleInterface() resolves this issue on loading modules, but it may need some additional testing to ensure the share APIs are still working as expected

"active": true
},
"s3_datasets": {
"active": true,
"features": {
Expand Down
6 changes: 3 additions & 3 deletions tests/modules/datasets/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import ShareableType, PrincipalType
from dataall.modules.dataset_sharing.db.share_object_models import ShareObject, ShareObjectItem
from dataall.modules.dataset_sharing.services.share_permissions import SHARE_OBJECT_REQUESTER, SHARE_OBJECT_APPROVER
from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_TABLE_READ
from dataall.modules.s3_datasets.db.dataset_models import Dataset, DatasetTable, DatasetStorageLocation
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_ALL
Expand All @@ -35,13 +35,13 @@ def patch_dataset_methods(module_mocker):
glue_mock_client().run_job.return_value = True

module_mocker.patch(
'dataall.modules.s3_datasets.services.datasets_enums.ConfidentialityClassification.validate_confidentiality_level',
'dataall.modules.datasets_base.services.datasets_enums.ConfidentialityClassification.validate_confidentiality_level',
return_value=True,
)

confidentiality_classification_mocker = MagicMock()
module_mocker.patch(
'dataall.modules.s3_datasets.services.datasets_enums.ConfidentialityClassification',
'dataall.modules.datasets_base.services.datasets_enums.ConfidentialityClassification',
return_value=confidentiality_classification_mocker,
)
# Return the input when mocking. This mock avoids checking the custom_confidentiality_mapping value in the actual function and just returns whatever confidentiality value is supplied for pytests
Expand Down
2 changes: 1 addition & 1 deletion tests/modules/datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset, DatasetLock
from tests.core.stacks.test_stack import update_stack_query

from dataall.modules.s3_datasets.services.datasets_enums import ConfidentialityClassification
from dataall.modules.datasets_base.services.datasets_enums import ConfidentialityClassification


mocked_key_id = 'some_key'
Expand Down