From dc582b31c749cfa41210cf9e93502a1ba9c94a6e Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 08:52:01 +0000 Subject: [PATCH 01/10] restructure --- src/source_msgraph/client.py | 103 ------------------ .../{ => core}/async_interator.py | 0 src/source_msgraph/core/base_client.py | 97 +++++++++++++++++ src/source_msgraph/{ => core}/constants.py | 0 src/source_msgraph/{ => core}/models.py | 3 +- src/source_msgraph/core/resource_provider.py | 50 +++++++++ src/source_msgraph/core/source.py | 53 +++++++++ src/source_msgraph/{ => core}/utils.py | 0 src/source_msgraph/generate_docs.py | 84 -------------- src/source_msgraph/resources.py | 30 ----- src/source_msgraph/resources/list_items.py | 22 ++++ src/source_msgraph/source.py | 63 ----------- 12 files changed, 223 insertions(+), 282 deletions(-) delete mode 100644 src/source_msgraph/client.py rename src/source_msgraph/{ => core}/async_interator.py (100%) create mode 100644 src/source_msgraph/core/base_client.py rename src/source_msgraph/{ => core}/constants.py (100%) rename src/source_msgraph/{ => core}/models.py (98%) create mode 100644 src/source_msgraph/core/resource_provider.py create mode 100644 src/source_msgraph/core/source.py rename src/source_msgraph/{ => core}/utils.py (100%) delete mode 100644 src/source_msgraph/generate_docs.py delete mode 100644 src/source_msgraph/resources.py create mode 100644 src/source_msgraph/resources/list_items.py delete mode 100644 src/source_msgraph/source.py diff --git a/src/source_msgraph/client.py b/src/source_msgraph/client.py deleted file mode 100644 index b429073..0000000 --- a/src/source_msgraph/client.py +++ /dev/null @@ -1,103 +0,0 @@ -from msgraph import GraphServiceClient -from kiota_abstractions.base_request_configuration import RequestConfiguration -from msgraph.generated.models.o_data_errors.o_data_error import ODataError -from azure.identity import ClientSecretCredential -from source_msgraph.async_interator import AsyncToSyncIterator -from source_msgraph.models import ConnectorOptions -from source_msgraph.utils import get_python_schema, to_json, to_pyspark_schema -from typing import Dict, Any - -class GraphClient: - def __init__(self, options: ConnectorOptions): - """ - Initializes the fetcher with the Graph client, resource path, and query parameters. - - - :param options: Connector options. - """ - credentials = ClientSecretCredential(options.tenant_id, options.client_id, options.client_secret) - self.graph_client = GraphServiceClient(credentials=credentials) - self.options: ConnectorOptions = options - - - async def fetch_data(self): - """ - Fetches data from Microsoft Graph using the dynamically built request. - Handles pagination automatically. 
- """ - query_parameters_cls = self.options.resource.get_query_parameters_cls() - - if query_parameters_cls: - try: - query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments - except TypeError as e: - raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") - - if self.options.resource.query_params: - for k, v in self.options.resource.query_params.items(): - k = k.removeprefix("%24") - if hasattr(query_parameters_instance, k): - setattr(query_parameters_instance, k, v) # Set attributes dynamically - else: - raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") - - request_configuration = RequestConfiguration( - query_parameters=query_parameters_instance - ) - - try: - builder = self.options.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.options.resource.resource_params) - items = await builder.get(request_configuration=request_configuration) - while True: - for item in items.value: - yield item - if not items.odata_next_link: - break - items = await builder.with_url(items.odata_next_link).get() - - except ODataError as e: - raise Exception(f"Graph API Error: {e.error.message}") - - -def iter_records(options: ConnectorOptions): - """ - Iterates over records from the Microsoft Graph API. - - :param options: Connector options containing authentication credentials and resource details. - :return: A synchronous iterator over the fetched data. - :raises ValueError: If any required credentials or resource parameters are missing. - :raises GraphAPIError: If the API request fails. - """ - fetcher = GraphClient(options) - async_gen = fetcher.fetch_data() - return AsyncToSyncIterator(async_gen) - - - -def get_resource_schema(options: ConnectorOptions) -> Dict[str, Any]: - """ - Retrieves the schema of a Microsoft Graph API resource by fetching a single record. - - :param options: Connector options containing authentication credentials and resource details. - :return: A dictionary representing the schema of the resource. - :raises ValueError: If no records are found or if required options are missing. - :raises GraphAPIError: If the API request fails. - """ - fetcher = GraphClient(options) - async_gen = fetcher.fetch_data() - - try: - record = next(AsyncToSyncIterator(async_gen), None) - if not record: - raise ValueError(f"No records found for resource: {options.resource.resource_name}") - record = to_json(record) - schema = to_pyspark_schema(get_python_schema(record)) - return record, schema - - except StopIteration: - raise ValueError(f"No records available for {options.resource.resource_name}") - -# Example usage -# options = ConnectorOptions(...) 
-# schema = get_resource_schema(options) -# print(json.dumps(schema, indent=2)) diff --git a/src/source_msgraph/async_interator.py b/src/source_msgraph/core/async_interator.py similarity index 100% rename from src/source_msgraph/async_interator.py rename to src/source_msgraph/core/async_interator.py diff --git a/src/source_msgraph/core/base_client.py b/src/source_msgraph/core/base_client.py new file mode 100644 index 0000000..6fceb1e --- /dev/null +++ b/src/source_msgraph/core/base_client.py @@ -0,0 +1,97 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict +from msgraph import GraphServiceClient +from kiota_abstractions.base_request_configuration import RequestConfiguration +from msgraph.generated.models.o_data_errors.o_data_error import ODataError +from source_msgraph.core.async_interator import AsyncToSyncIterator +from source_msgraph.core.models import BaseResource +from source_msgraph.core.utils import get_python_schema, to_json, to_pyspark_schema + +from azure.identity import DefaultAzureCredential, EnvironmentCredential + +class BaseResourceProvider(ABC): + def __init__(self, options: Dict[str, Any]): + """ + Initializes the fetcher with the Graph client, resource path, and query parameters. + + :param options: Connector options. + """ + self.options = options + credentials = DefaultAzureCredential() + self.graph_client = GraphServiceClient(credentials=credentials) + + async def fetch_data(self): + """ + Fetches data from Microsoft Graph using the dynamically built request. + Handles pagination automatically. + """ + query_parameters_cls = self.resource.get_query_parameters_cls() + + if query_parameters_cls: + try: + query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments + except TypeError as e: + raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") + + if self.resource.query_params: + for k, v in self.resource.query_params.items(): + k = k.removeprefix("%24") + if hasattr(query_parameters_instance, k): + setattr(query_parameters_instance, k, v) # Set attributes dynamically + else: + raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") + + request_configuration = RequestConfiguration( + query_parameters=query_parameters_instance + ) + + try: + builder = self.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.resource.resource_params) + items = await builder.get(request_configuration=request_configuration) + while True: + for item in items.value: + yield item + if not items.odata_next_link: + break + items = await builder.with_url(items.odata_next_link).get() + + except ODataError as e: + raise Exception(f"Graph API Error: {e.error.message}") + + def iter_records(self): + """ + Iterates over records from the Microsoft Graph API. + + :param options: Connector options containing authentication credentials and resource details. + :return: A synchronous iterator over the fetched data. + :raises ValueError: If any required credentials or resource parameters are missing. + :raises GraphAPIError: If the API request fails. + """ + async_gen = self.fetch_data() + return AsyncToSyncIterator(async_gen) + + def get_resource_schema(self) -> Dict[str, Any]: + """ + Retrieves the schema of a Microsoft Graph API resource by fetching a single record. + + :param options: Connector options containing authentication credentials and resource details. + :return: A dictionary representing the schema of the resource. 
+        :raises ValueError: If no records are found or if required options are missing.
+        :raises GraphAPIError: If the API request fails.
+        """
+        async_gen = self.fetch_data()
+
+        try:
+            record = next(AsyncToSyncIterator(async_gen), None)
+            if not record:
+                raise ValueError(f"No records found for resource: {self.resource.resource_name}")
+            record = to_json(record)
+            schema = to_pyspark_schema(get_python_schema(record))
+            return record, schema
+
+        except StopIteration:
+            raise ValueError(f"No records available for {self.resource.resource_name}")
+
+    @abstractmethod
+    def resource(self) -> BaseResource:
+        ...
\ No newline at end of file
diff --git a/src/source_msgraph/constants.py b/src/source_msgraph/core/constants.py
similarity index 100%
rename from src/source_msgraph/constants.py
rename to src/source_msgraph/core/constants.py
diff --git a/src/source_msgraph/models.py b/src/source_msgraph/core/models.py
similarity index 98%
rename from src/source_msgraph/models.py
rename to src/source_msgraph/core/models.py
index 1f5c046..964dc7e 100644
--- a/src/source_msgraph/models.py
+++ b/src/source_msgraph/core/models.py
@@ -3,7 +3,7 @@
 import inspect
 import re
 from typing import Any, Dict
-from source_msgraph.constants import MSGRAPH_SDK_PACKAGE
+from source_msgraph.core.constants import MSGRAPH_SDK_PACKAGE
 from urllib.parse import unquote
 from kiota_abstractions.base_request_builder import BaseRequestBuilder
 
@@ -157,7 +157,6 @@ class ConnectorOptions:
     tenant_id: str
     client_id: str
     client_secret: str
-    resource: BaseResource
 
     def __post_init__(self):
         ...
diff --git a/src/source_msgraph/core/resource_provider.py b/src/source_msgraph/core/resource_provider.py
new file mode 100644
index 0000000..460d156
--- /dev/null
+++ b/src/source_msgraph/core/resource_provider.py
@@ -0,0 +1,50 @@
+from functools import lru_cache
+import importlib
+import pkgutil
+from typing import Dict, Type
+from source_msgraph.core.base_client import BaseResourceProvider
+
+
+def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]:
+    """
+    Dynamically loads all resource providers from the resources package
+    """
+    providers = {}
+    package = 'source_msgraph.resources'
+
+    # Import the resources package
+    resources_pkg = importlib.import_module(package)
+
+    # Iterate through all submodules
+    for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__):
+        if name != 'base': # Skip the base module
+            try:
+                # Import the module
+                module = importlib.import_module(f'{package}.{name}')
+                # Look for *ResourceProvider class
+                for attr_name in dir(module):
+                    if attr_name.endswith('ResourceProvider'):
+                        provider_class = getattr(module, attr_name)
+                        if (isinstance(provider_class, type) and
+                            issubclass(provider_class, BaseResourceProvider) and
+                            provider_class != BaseResourceProvider):
+                            providers[name] = provider_class
+            except ImportError as e:
+                print(f"Warning: Could not load resource provider {name}: {e}")
+
+    return providers
+
+def get_resource_provider(resource_name: str, options: Dict[str, str]) -> BaseResourceProvider:
+    """
+    Factory method to get the appropriate resource provider
+    """
+    providers = load_resource_providers()
+    provider_class: BaseResourceProvider = providers.get(resource_name)
+
+    if not provider_class:
+        available = ', '.join(providers.keys())
+        raise ValueError(
+            f"Unsupported resource name: '{resource_name}'. "
+            f"Available resources: {available}"
+        )
+    return provider_class(options)
\ No newline at end of file
diff --git a/src/source_msgraph/core/source.py b/src/source_msgraph/core/source.py
new file mode 100644
index 0000000..1a79af3
--- /dev/null
+++ b/src/source_msgraph/core/source.py
@@ -0,0 +1,53 @@
+import logging
+from typing import Any, Dict, Union
+from pyspark.sql.datasource import DataSource, DataSourceReader
+from pyspark.sql.types import StructType
+from source_msgraph.core.base_client import BaseResourceProvider
+
+from source_msgraph.core.resource_provider import get_resource_provider
+
+# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources
+
+logger = logging.getLogger(__name__)
+
+class MSGraphDataSource(DataSource):
+    """
+    PySpark DataSource for reading Microsoft Graph API resources.
+    """
+    def __init__(self, options: Dict[str, Any]):
+
+        self.resource_name = options.pop("resource")
+        if not self.resource_name:
+            raise ValueError("resource is missing, please provide a valid resource name.")
+        self.options = options
+
+    @classmethod
+    def name(cls):
+        return "msgraph"
+
+    def schema(self):
+        logger.info("Schema not provided, inferring from the source.")
+        resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options)
+        _, schema = resource_provider.get_resource_schema()
+        logger.debug(f"Inferred schema: {schema}")
+        return schema
+
+    def reader(self, schema: StructType):
+        return MSGraphDataSourceReader(self.resource_name, self.options, schema)
+
+
+class MSGraphDataSourceReader(DataSourceReader):
+
+    def __init__(self, resource_name :str, options: Dict[str, Any], schema: Union[StructType, str]):
+        self.schema: StructType = schema
+        self.options = options
+        self.resource_name = resource_name
+
+    def read(self, partition):
+        from source_msgraph.core.utils import to_json
+        from pyspark.sql import Row
+        resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options)
+        for row in resource_provider.iter_records():
+            row = to_json(row)
+            row_data = {f.name: row.get(f.name, None) for f in self.schema.fields}
+            yield Row(**row_data)
diff --git a/src/source_msgraph/utils.py b/src/source_msgraph/core/utils.py
similarity index 100%
rename from src/source_msgraph/utils.py
rename to src/source_msgraph/core/utils.py
diff --git a/src/source_msgraph/generate_docs.py b/src/source_msgraph/generate_docs.py
deleted file mode 100644
index 0903c63..0000000
--- a/src/source_msgraph/generate_docs.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import os
-from urllib.parse import unquote
-from source_msgraph.models import BaseResource
-from source_msgraph.resources import RESOURCE_CONFIGS
-
-def generate_markdown(resource: BaseResource) -> str:
-    """
-    Generates sophisticated markdown documentation for a given BaseResource.
- """ - md_content = [f"# {resource.name.capitalize()} Resource", ""] - md_content.append(f"**Resource Name:** `{resource.name.lower()}`") - - - md_content.append("\n## Overview") - md_content.append(f"The `{resource.name}` resource provides a structured way to interact with Microsoft Graph API.") - md_content.append("This resource supports operations such as retrieval and filtering of data.") - - md_content.append("\n## Resource Parameters") - if len(resource.resource_params.keys()) > 0: - md_content.append("| Parameter | Type | Required | Description |") - md_content.append("|-----------|------|----------|-------------|") - for param in resource.resource_params or {}: - md_content.append(f"| `{unquote(param)}` | `str` | ✅ | Required path parameter for resource access. |") - else: - md_content.append(f"> No parameters required for `{resource.name.lower()}` resource.") - - - md_content.append("\n## Query Parameters") - if len(resource.query_params.keys()) > 0: - md_content.append("| Parameter | Type | Required | Description |") - md_content.append("|-----------|------|----------|-------------|") - for param in resource.query_params or {}: - md_content.append(f"| `{unquote(param)}` | `str` | ❌ | Optional query parameter to refine the API request. |") - else: - md_content.append(f">> No query parameters are required for `{resource.name.lower()}` resource.") - - md_content.append("---") - - md_content.append("Tip: Please refer [Microsoft Graph API]() documentation if you don't see a field. This can be resolved by provising `expand` option.") - - md_content.append("\n## Example Usage") - md_content.append("```python") - md_content.append("from source_msgraph.source import MSGraphDataSource") - md_content.append("spark.dataSource.register(MSGraphDataSource)") - md_content.append("") - md_content.append("# Read data using Microsoft Graph") - md_content.append("df = spark.read.format(\"msgraph\") ") - md_content.append(" .option(\"tenant_id\", tenant_id)") - md_content.append(" .option(\"client_id\", client_id)") - md_content.append(" .option(\"client_secret\", client_secret)") - md_content.append(f" .option(\"resource\", \"{resource.name}\")") - for param in resource.resource_params or {}: - md_content.append(f" .option(\"{param}\", \"\")") - for param in resource.query_params or {}: - md_content.append(f" .option(\"{param}\", \"\")") - md_content.append(" .schema(\"id string, eTag string\")") - md_content.append(" .load()") - md_content.append("") - md_content.append("df.show()") - md_content.append("```") - - return "\n".join(md_content) - -def generate_docs(output_dir: str = "docs"): - """ - Generates sophisticated markdown documentation for all configured resources. 
- """ - os.makedirs(output_dir, exist_ok=True) - - for config in RESOURCE_CONFIGS: - resource = BaseResource( - name=config["name"], - resource_name=config["resource_name"], - request_builder_module=config["request_builder_module"] - ) - - md_content = generate_markdown(resource) - file_path = os.path.join(output_dir, f"{resource.name}.md") - with open(file_path, "w", encoding="utf-8") as f: - f.write(md_content) - print(f"Generated documentation: {file_path}") - -if __name__ == "__main__": - generate_docs() \ No newline at end of file diff --git a/src/source_msgraph/resources.py b/src/source_msgraph/resources.py deleted file mode 100644 index 43e5b08..0000000 --- a/src/source_msgraph/resources.py +++ /dev/null @@ -1,30 +0,0 @@ -# Define the resources to generate -from source_msgraph.models import BaseResource - - -RESOURCE_CONFIGS = [ - {"name": "sites", "resource_name": "sites", "request_builder_module": "sites.sites_request_builder"}, - {"name": "lists", "resource_name": "lists", "request_builder_module": "sites.item.lists.lists_request_builder"}, - {"name": "list_items", "resource_name": "items", "request_builder_module": "sites.item.lists.item.items.items_request_builder"}, -] - - - -def get_resource(name: str): - """ - Generates a list of BaseResource instances for specified Microsoft Graph resources. - """ - config = next((config for config in RESOURCE_CONFIGS if config["name"] == name), None) - if not config: - raise ValueError(f"Resource '{name}' is not supported yet. stay tuned!") - - # Create and store the BaseResource instance - resource = BaseResource( - name=config["name"], - resource_name=config["resource_name"], - request_builder_module=config["request_builder_module"] - ) - return resource - - - diff --git a/src/source_msgraph/resources/list_items.py b/src/source_msgraph/resources/list_items.py new file mode 100644 index 0000000..6302d53 --- /dev/null +++ b/src/source_msgraph/resources/list_items.py @@ -0,0 +1,22 @@ +from functools import cached_property +from typing import Dict + +from source_msgraph.core.base_client import BaseResourceProvider +from source_msgraph.core.models import BaseResource + + +class ListItemsResourceProvider(BaseResourceProvider): + + def __init__(self, options: Dict[str, str]): + self.options = options + super().__init__(options) + + @cached_property + def resource(self) -> BaseResource: + return BaseResource( + name="list_items", + resource_name="items", + request_builder_module="sites.item.lists.item.items.items_request_builder" + ).map_options_to_params(self.options) + + diff --git a/src/source_msgraph/source.py b/src/source_msgraph/source.py deleted file mode 100644 index 7366e3d..0000000 --- a/src/source_msgraph/source.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging -from typing import Any, Dict, Union -from pyspark.sql.datasource import DataSource, DataSourceReader -from pyspark.sql.types import StructType -from source_msgraph.client import get_resource_schema, iter_records -from source_msgraph.models import ConnectorOptions - -from source_msgraph.resources import get_resource -# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources - -logger = logging.getLogger(__name__) - -class MSGraphDataSource(DataSource): - """ - - """ - def __init__(self, options: Dict[str, Any]): - - tenant_id=options.pop("tenant_id") - client_id=options.pop("client_id") - client_secret=options.pop("client_secret") - - resource_name = options.pop("resource") - if not resource_name: - raise ValueError("resource is missing, please provide 
a valid resource name.") - - resource = get_resource(resource_name).map_options_to_params(options) - - self.connector_options: ConnectorOptions = ConnectorOptions( - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - resource=resource - ) - - - @classmethod - def name(cls): - return "msgraph" - - def schema(self): - logger.info("Schema not provided, infering from the source.") - _, schema = get_resource_schema(self.connector_options) - logger.debug(f"Infered schema : {schema}") - return schema - - def reader(self, schema: StructType): - return MSGraphDataSourceReader(self.connector_options, schema) - - -class MSGraphDataSourceReader(DataSourceReader): - - def __init__(self, options: ConnectorOptions, schema: Union[StructType, str]): - self.schema: StructType = schema - self.options:ConnectorOptions = options - - def read(self, partition): - from source_msgraph.utils import to_json - from pyspark.sql import Row - for row in iter_records(self.options): - row = to_json(row) - row_data = {f.name: row.get(f.name, None) for f in self.schema.fields} - yield Row(**row_data) From d25e920bc344771f2cd47dec50a28f141ee0e51f Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 11:29:21 +0000 Subject: [PATCH 02/10] add extras support --- poetry.lock | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 6d15189..74e07f5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3012,4 +3012,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4" -content-hash = "55fe3e2bccd32c0c86b24ee1f86e76a6137c76af323fb7d50cc53b9b5d5ca1f3" +content-hash = "0cdc9d351347552e7a8e246e9d9663b1e1c11fa0b87a86cdafdcecfeff21fb83" diff --git a/pyproject.toml b/pyproject.toml index b497866..cc94c72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,8 @@ dependencies = [ [tool.poetry] packages = [{include = "source_msgraph", from = "src"}] +[tool.poetry.extras] +list_items= [] [tool.poetry.group.dev.dependencies] pytest = "^8.3.4" From 0b76852ffc1e014ea25c239c9f0ba7c5cf88992b Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 11:29:51 +0000 Subject: [PATCH 03/10] add support for extras --- src/source_msgraph/core/models.py | 5 +++++ src/source_msgraph/core/resource_provider.py | 14 ++++++-------- src/source_msgraph/core/source.py | 4 ++-- src/source_msgraph/resources/__init__.py | 1 + src/source_msgraph/resources/list_items.py | 1 + 5 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 src/source_msgraph/resources/__init__.py diff --git a/src/source_msgraph/core/models.py b/src/source_msgraph/core/models.py index 964dc7e..2ab1145 100644 --- a/src/source_msgraph/core/models.py +++ b/src/source_msgraph/core/models.py @@ -1,6 +1,7 @@ from dataclasses import dataclass import importlib import inspect +import logging import re from typing import Any, Dict from source_msgraph.core.constants import MSGRAPH_SDK_PACKAGE @@ -134,6 +135,8 @@ def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': raise ValueError(f"Missing required resource parameters: {', '.join(missing_params)}") # TODO: add max $top value validation. 
+ if int(options.get("top", 1)) <= 100: + logging.warning("Setting a low `top` value in Microsoft Graph queries can cause high latency and increase throttling risk.") mapped_query_params = {"%24"+k: v for k, v in options.items() if k in self.query_params} mapped_resource_params = {k.replace("-", "%2D"): v for k, v in options.items() if k in self.resource_params} @@ -147,6 +150,8 @@ def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': self.resource_params = mapped_resource_params return self + + GUID_PATTERN = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") diff --git a/src/source_msgraph/core/resource_provider.py b/src/source_msgraph/core/resource_provider.py index 460d156..5f2a9d0 100644 --- a/src/source_msgraph/core/resource_provider.py +++ b/src/source_msgraph/core/resource_provider.py @@ -4,7 +4,7 @@ from typing import Dict, Type from source_msgraph.core.base_client import BaseResourceProvider - +# @lru_cache(maxsize=10) def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: """ Dynamically loads all resource providers from the resources package @@ -15,13 +15,10 @@ def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: # Import the resources package resources_pkg = importlib.import_module(package) - # Iterate through all submodules for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__): if name != 'base': # Skip the base module try: - # Import the module module = importlib.import_module(f'{package}.{name}') - # Look for *ResourceProvider class for attr_name in dir(module): if attr_name.endswith('ResourceProvider'): provider_class = getattr(module, attr_name) @@ -32,13 +29,14 @@ def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: except ImportError as e: print(f"Warning: Could not load resource provider {name}: {e}") - return providers + return frozenset(providers.items()) -def get_resource_provider(resource_name: str, options: Dict[str, str]) -> BaseResourceProvider: +# @lru_cache(maxsize=10) +def get_resource_provider(resource_name: str, options: frozenset) -> BaseResourceProvider: """ Factory method to get the appropriate resource provider """ - providers = load_resource_providers() + providers = dict(load_resource_providers()) provider_class: BaseResourceProvider = providers.get(resource_name) if not provider_class: @@ -47,4 +45,4 @@ def get_resource_provider(resource_name: str, options: Dict[str, str]) -> BaseRe f"Unsupported resource name: '{resource_name}'. 
" f"Available resources: {available}" ) - return provider_class(options) \ No newline at end of file + return provider_class(dict(options)) \ No newline at end of file diff --git a/src/source_msgraph/core/source.py b/src/source_msgraph/core/source.py index 1a79af3..b23466a 100644 --- a/src/source_msgraph/core/source.py +++ b/src/source_msgraph/core/source.py @@ -19,7 +19,7 @@ def __init__(self, options: Dict[str, Any]): self.resource_name = options.pop("resource") if not self.resource_name: raise ValueError("resource is missing, please provide a valid resource name.") - self.options = options + self.options = frozenset(options.items()) @classmethod def name(cls): @@ -38,7 +38,7 @@ def reader(self, schema: StructType): class MSGraphDataSourceReader(DataSourceReader): - def __init__(self, resource_name :str, options: Dict[str, Any], schema: Union[StructType, str]): + def __init__(self, resource_name :str, options: frozenset, schema: Union[StructType, str]): self.schema: StructType = schema self.options = options self.resource_name = resource_name diff --git a/src/source_msgraph/resources/__init__.py b/src/source_msgraph/resources/__init__.py new file mode 100644 index 0000000..5ee7122 --- /dev/null +++ b/src/source_msgraph/resources/__init__.py @@ -0,0 +1 @@ +from .list_items import * # type: ignore \ No newline at end of file diff --git a/src/source_msgraph/resources/list_items.py b/src/source_msgraph/resources/list_items.py index 6302d53..b9927d5 100644 --- a/src/source_msgraph/resources/list_items.py +++ b/src/source_msgraph/resources/list_items.py @@ -1,4 +1,5 @@ from functools import cached_property +import logging from typing import Dict from source_msgraph.core.base_client import BaseResourceProvider From 8a8059acde676e63b5d5389059a13764a121d0e4 Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 12:36:24 +0000 Subject: [PATCH 04/10] change package name --- pyproject.toml | 8 +- src/source_msgraph/__init__.py | 0 src/source_msgraph/core/async_interator.py | 69 -------- src/source_msgraph/core/base_client.py | 97 ---------- src/source_msgraph/core/constants.py | 2 - src/source_msgraph/core/models.py | 177 ------------------- src/source_msgraph/core/resource_provider.py | 48 ----- src/source_msgraph/core/source.py | 53 ------ src/source_msgraph/core/utils.py | 91 ---------- src/source_msgraph/resources/__init__.py | 1 - src/source_msgraph/resources/list_items.py | 23 --- tests/test_source.py | 2 +- 12 files changed, 6 insertions(+), 565 deletions(-) delete mode 100644 src/source_msgraph/__init__.py delete mode 100644 src/source_msgraph/core/async_interator.py delete mode 100644 src/source_msgraph/core/base_client.py delete mode 100644 src/source_msgraph/core/constants.py delete mode 100644 src/source_msgraph/core/models.py delete mode 100644 src/source_msgraph/core/resource_provider.py delete mode 100644 src/source_msgraph/core/source.py delete mode 100644 src/source_msgraph/core/utils.py delete mode 100644 src/source_msgraph/resources/__init__.py delete mode 100644 src/source_msgraph/resources/list_items.py diff --git a/pyproject.toml b/pyproject.toml index cc94c72..0838f00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,13 @@ [project] -name = "source-msgraph" +name = "pyspark_msgraph_source" version = "0.1.0" -description = "" +description = "Pyspark custom data source for Microsoft Graph APIs, including path and query parameters, with PySpark read examples." 
authors = [ {name = "geekwhocodes",email = "ganeshraskar@outlook.com"} ] readme = "README.md" +homepage = "https://github.com/geekwhocodes/pyspark-msgraph-source" +repository = "https://github.com/geekwhocodes/pyspark-msgraph-source" requires-python = ">=3.12,<4" dependencies = [ "pyspark (==4.0.0.dev2)", @@ -15,7 +17,7 @@ dependencies = [ ] [tool.poetry] -packages = [{include = "source_msgraph", from = "src"}] +packages = [{include = "pyspark_msgraph_source", from = "src"}] [tool.poetry.extras] list_items= [] diff --git a/src/source_msgraph/__init__.py b/src/source_msgraph/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/source_msgraph/core/async_interator.py b/src/source_msgraph/core/async_interator.py deleted file mode 100644 index b2c121a..0000000 --- a/src/source_msgraph/core/async_interator.py +++ /dev/null @@ -1,69 +0,0 @@ -import asyncio - -import asyncio -from typing import AsyncGenerator, Iterator, Any - -import asyncio -from typing import AsyncGenerator, Iterator, Any - -class AsyncToSyncIterator: - """ - Converts an async generator into a synchronous iterator while ensuring proper event loop handling. - """ - - def __init__(self, async_gen: AsyncGenerator[Any, None]): - """ - Initializes the iterator by consuming an async generator synchronously. - - Args: - async_gen (AsyncGenerator): The async generator yielding results. - """ - self.async_gen = async_gen - self.iterator = self._to_iterator() - - def _to_iterator(self) -> Iterator: - """ - Ensures that the async generator is consumed using the correct event loop. - Uses streaming (does not load all results into memory). - """ - try: - loop = asyncio.get_running_loop() - return self._sync_generator(loop) # Works inside Jupyter - except RuntimeError: - return iter(asyncio.run(self._collect_results())) # Works in scripts - - def _sync_generator(self, loop: asyncio.AbstractEventLoop) -> Iterator: - """ - Streams async results into a sync generator while inside a running event loop. 
- """ - queue = asyncio.Queue() - - async def _producer(): - """Fills the queue with async results.""" - async for item in self.async_gen: - await queue.put(item) - await queue.put(None) # Sentinel to signal completion - - async def _consumer(): - """Yields items from the queue in sync mode.""" - task = loop.create_task(_producer()) - while True: - item = await queue.get() - if item is None: - break - yield item - await task # Ensure producer task completes - - return iter(loop.run_until_complete(self._collect_results())) - - async def _collect_results(self): - """Collects async generator results into a list (safe for asyncio.run).""" - return [item async for item in self.async_gen] - - def __iter__(self) -> Iterator: - """Returns the synchronous iterator.""" - return self.iterator - - def __next__(self) -> Any: - """Returns the next item from the iterator.""" - return next(self.iterator) \ No newline at end of file diff --git a/src/source_msgraph/core/base_client.py b/src/source_msgraph/core/base_client.py deleted file mode 100644 index 6fceb1e..0000000 --- a/src/source_msgraph/core/base_client.py +++ /dev/null @@ -1,97 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Dict -from msgraph import GraphServiceClient -from kiota_abstractions.base_request_configuration import RequestConfiguration -from msgraph.generated.models.o_data_errors.o_data_error import ODataError -from source_msgraph.core.async_interator import AsyncToSyncIterator -from source_msgraph.core.models import BaseResource -from source_msgraph.core.utils import get_python_schema, to_json, to_pyspark_schema - -from azure.identity import DefaultAzureCredential, EnvironmentCredential - -class BaseResourceProvider(ABC): - def __init__(self, options: Dict[str, Any]): - """ - Initializes the fetcher with the Graph client, resource path, and query parameters. - - :param options: Connector options. - """ - self.options = options - credentials = DefaultAzureCredential() - self.graph_client = GraphServiceClient(credentials=credentials) - - async def fetch_data(self): - """ - Fetches data from Microsoft Graph using the dynamically built request. - Handles pagination automatically. - """ - query_parameters_cls = self.resource.get_query_parameters_cls() - - if query_parameters_cls: - try: - query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments - except TypeError as e: - raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") - - if self.resource.query_params: - for k, v in self.resource.query_params.items(): - k = k.removeprefix("%24") - if hasattr(query_parameters_instance, k): - setattr(query_parameters_instance, k, v) # Set attributes dynamically - else: - raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") - - request_configuration = RequestConfiguration( - query_parameters=query_parameters_instance - ) - - try: - builder = self.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.resource.resource_params) - items = await builder.get(request_configuration=request_configuration) - while True: - for item in items.value: - yield item - if not items.odata_next_link: - break - items = await builder.with_url(items.odata_next_link).get() - - except ODataError as e: - raise Exception(f"Graph API Error: {e.error.message}") - - def iter_records(self): - """ - Iterates over records from the Microsoft Graph API. - - :param options: Connector options containing authentication credentials and resource details. 
- :return: A synchronous iterator over the fetched data. - :raises ValueError: If any required credentials or resource parameters are missing. - :raises GraphAPIError: If the API request fails. - """ - async_gen = self.fetch_data() - return AsyncToSyncIterator(async_gen) - - def get_resource_schema(self) -> Dict[str, Any]: - """ - Retrieves the schema of a Microsoft Graph API resource by fetching a single record. - - :param options: Connector options containing authentication credentials and resource details. - :return: A dictionary representing the schema of the resource. - :raises ValueError: If no records are found or if required options are missing. - :raises GraphAPIError: If the API request fails. - """ - async_gen = self.fetch_data() - - try: - record = next(AsyncToSyncIterator(async_gen), None) - if not record: - raise ValueError(f"No records found for resource: {self.resource.resource_name}") - record = to_json(record) - schema = to_pyspark_schema(get_python_schema(record)) - return record, schema - - except StopIteration: - raise ValueError(f"No records available for {self.resource.resource_name}") - - @abstractmethod - def resource(self) -> BaseResource: - ... \ No newline at end of file diff --git a/src/source_msgraph/core/constants.py b/src/source_msgraph/core/constants.py deleted file mode 100644 index 6a42334..0000000 --- a/src/source_msgraph/core/constants.py +++ /dev/null @@ -1,2 +0,0 @@ -# Base generated package for Microsoft Graph SDK -MSGRAPH_SDK_PACKAGE = "msgraph.generated" diff --git a/src/source_msgraph/core/models.py b/src/source_msgraph/core/models.py deleted file mode 100644 index 2ab1145..0000000 --- a/src/source_msgraph/core/models.py +++ /dev/null @@ -1,177 +0,0 @@ -from dataclasses import dataclass -import importlib -import inspect -import logging -import re -from typing import Any, Dict -from source_msgraph.core.constants import MSGRAPH_SDK_PACKAGE -from urllib.parse import unquote -from kiota_abstractions.base_request_builder import BaseRequestBuilder - -@dataclass -class BaseResource: - name: str # User friendly name for Spark reader - resource_name: str # Microsoft Graph leaf resource name - request_builder_module: str - query_params: Dict[str, Any] = None - resource_params: Dict[str, Any] = None - request_builder_cls_name: str = None - request_builder_query_cls_name: str = None - - def __post_init__(self): - if not self.name: - raise ValueError("name is required") - - self.request_builder_cls_name = self._pascal_case(f"{self.resource_name}_request_builder") - #self.request_builder_cls = self.get_request_builder_cls() - self.request_builder_query_cls_name = self._pascal_case(f"{self.resource_name}_request_builder_get_query_parameters") - #self.query_parameters_cls = self.get_query_parameters_cls() - self.parse_url_template() - - - @classmethod - def _pascal_case(cls, snake_str: str) -> str: - """ - Converts snake_case to PascalCase. - Example: "items_request_builder" -> "ItemsRequestBuilder" - """ - return "".join(word.title() for word in snake_str.split("_")) - - def get_query_parameters_cls(self): - """ - Retrieves the query parameters class from the request builder module. 
- """ - try: - module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") - request_builder_cls = getattr(module, self.request_builder_cls_name, None) - - if not request_builder_cls or not issubclass(request_builder_cls, BaseRequestBuilder): - raise AttributeError(f"{self.request_builder_cls_name} not found in {module.__name__}") - - # Inspect the attributes to find the query parameters class - - for attr in dir(request_builder_cls): - if attr == self.request_builder_query_cls_name: - return getattr(request_builder_cls, attr) - raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") - - except ModuleNotFoundError: - raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") - - def get_request_builder_cls(self) -> BaseRequestBuilder: - """ - Dynamically imports a module and finds the RequestBuilder class. - """ - try: - module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") - for attr in dir(module): - if attr == self.request_builder_cls_name: - cls = getattr(module, attr) - if not issubclass(cls, BaseRequestBuilder): - raise AttributeError(f"{attr} is not a subclass of BaseRequestBuilder") - return cls - except ImportError: - raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") - - def get_request_builder_url_template(self): - """ - Extracts the `url_template` by analyzing the source code of the class. - """ - try: - cls = self.get_request_builder_cls() - if inspect.isclass(cls) and hasattr(cls, "__init__"): - # Extract the __init__ function source code - init_source = inspect.getsource(cls.__init__) - if "super().__init__(" in init_source: - lines = init_source.split("\n") - for line in lines: - if "super().__init__(" in line: - match = re.search(r'super\(\).__init__\s*\([^,]+,\s*"([^"]+)"', line) - if match: - url_template = match.group(1).replace('"', "").replace("'", "") - return url_template - - except TypeError: - raise TypeError(f"Error extracting URL template from {cls.__name__}") - - def parse_url_template(self): - """ - Parses the `url_template` string to extract path parameters and query parameters. - """ - url_template = self.get_request_builder_url_template() - if not url_template: - raise ValueError("URL template not found in request builder class") - - # Extract path parameters (decode %2Did → _id) - path_parameters = [ - unquote(match.group(1)).replace("%2D", "_") - for match in re.finditer(r"\{([^?}]+)\}", url_template) - if match.group(1).lower() != "+baseurl" - ] - - # Extract query parameters (decode %24expand → $expand) - query_match = re.search(r"\{\?([^}]+)\}", url_template) - query_parameters = ( - [unquote(q).replace("%24", "$") for q in query_match.group(1).split(",")] - if query_match else [] - ) - - self.resource_params = {k:None for k in path_parameters} - self.query_params = {qp.strip().replace("$", ""): None for qp in query_parameters} - - - def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': - """ - Maps the provided options to either query parameters or resource parameters. - - :param options: Dictionary of options provided by the user. - :param query_params: List of valid query parameter names. - :param resource_params: List of valid resource parameter names. 
- :return: A tuple (mapped_query_params, mapped_resource_params, invalid_params) - """ - missing_params = [param for param in self.resource_params if param not in options] - - if missing_params: - raise ValueError(f"Missing required resource parameters: {', '.join(missing_params)}") - - # TODO: add max $top value validation. - if int(options.get("top", 1)) <= 100: - logging.warning("Setting a low `top` value in Microsoft Graph queries can cause high latency and increase throttling risk.") - - mapped_query_params = {"%24"+k: v for k, v in options.items() if k in self.query_params} - mapped_resource_params = {k.replace("-", "%2D"): v for k, v in options.items() if k in self.resource_params} - - invalid_params = {k: v for k, v in options.items() if k not in self.query_params and k not in self.resource_params} - - if len(invalid_params) > 0: - raise ValueError(f"Extra parameters {invalid_params} not allowed.") - - self.query_params = mapped_query_params - self.resource_params = mapped_resource_params - - return self - - - -GUID_PATTERN = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") - - -@dataclass -class ConnectorOptions: - """Options for Microsoft Graph API requests with strict resource_path validation.""" - tenant_id: str - client_id: str - client_secret: str - def __post_init__(self): - ... - - def _validate_credentials(self): - """Validates the format and presence of credentials.""" - if not self.tenant_id or not GUID_PATTERN.match(self.tenant_id): - raise ValueError("Invalid tenant_id: must be a valid GUID.") - - if not self.client_id or not GUID_PATTERN.match(self.client_id): - raise ValueError("Invalid client_id: must be a valid GUID.") - - if not self.client_secret or not isinstance(self.client_secret, str): - raise ValueError("Invalid client_secret: must be a non-empty string.") \ No newline at end of file diff --git a/src/source_msgraph/core/resource_provider.py b/src/source_msgraph/core/resource_provider.py deleted file mode 100644 index 5f2a9d0..0000000 --- a/src/source_msgraph/core/resource_provider.py +++ /dev/null @@ -1,48 +0,0 @@ -from functools import lru_cache -import importlib -import pkgutil -from typing import Dict, Type -from source_msgraph.core.base_client import BaseResourceProvider - -# @lru_cache(maxsize=10) -def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: - """ - Dynamically loads all resource providers from the resources package - """ - providers = {} - package = 'source_msgraph.resources' - - # Import the resources package - resources_pkg = importlib.import_module(package) - - for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__): - if name != 'base': # Skip the base module - try: - module = importlib.import_module(f'{package}.{name}') - for attr_name in dir(module): - if attr_name.endswith('ResourceProvider'): - provider_class = getattr(module, attr_name) - if (isinstance(provider_class, type) and - issubclass(provider_class, BaseResourceProvider) and - provider_class != BaseResourceProvider): - providers[name] = provider_class - except ImportError as e: - print(f"Warning: Could not load resource provider {name}: {e}") - - return frozenset(providers.items()) - -# @lru_cache(maxsize=10) -def get_resource_provider(resource_name: str, options: frozenset) -> BaseResourceProvider: - """ - Factory method to get the appropriate resource provider - """ - providers = dict(load_resource_providers()) - provider_class: BaseResourceProvider = providers.get(resource_name) - - if not provider_class: 
-        available = ', '.join(providers.keys())
-        raise ValueError(
-            f"Unsupported resource name: '{resource_name}'. "
-            f"Available resources: {available}"
-        )
-    return provider_class(dict(options))
\ No newline at end of file
diff --git a/src/source_msgraph/core/source.py b/src/source_msgraph/core/source.py
deleted file mode 100644
index b23466a..0000000
--- a/src/source_msgraph/core/source.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import logging
-from typing import Any, Dict, Union
-from pyspark.sql.datasource import DataSource, DataSourceReader
-from pyspark.sql.types import StructType
-from source_msgraph.core.base_client import BaseResourceProvider
-
-from source_msgraph.core.resource_provider import get_resource_provider
-
-# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources
-
-logger = logging.getLogger(__name__)
-
-class MSGraphDataSource(DataSource):
-    """
-    PySpark DataSource for reading Microsoft Graph API resources.
-    """
-    def __init__(self, options: Dict[str, Any]):
-
-        self.resource_name = options.pop("resource")
-        if not self.resource_name:
-            raise ValueError("resource is missing, please provide a valid resource name.")
-        self.options = frozenset(options.items())
-
-    @classmethod
-    def name(cls):
-        return "msgraph"
-
-    def schema(self):
-        logger.info("Schema not provided, inferring from the source.")
-        resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options)
-        _, schema = resource_provider.get_resource_schema()
-        logger.debug(f"Inferred schema: {schema}")
-        return schema
-
-    def reader(self, schema: StructType):
-        return MSGraphDataSourceReader(self.resource_name, self.options, schema)
-
-
-class MSGraphDataSourceReader(DataSourceReader):
-
-    def __init__(self, resource_name :str, options: frozenset, schema: Union[StructType, str]):
-        self.schema: StructType = schema
-        self.options = options
-        self.resource_name = resource_name
-
-    def read(self, partition):
-        from source_msgraph.core.utils import to_json
-        from pyspark.sql import Row
-        resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options)
-        for row in resource_provider.iter_records():
-            row = to_json(row)
-            row_data = {f.name: row.get(f.name, None) for f in self.schema.fields}
-            yield Row(**row_data)
diff --git a/src/source_msgraph/core/utils.py b/src/source_msgraph/core/utils.py
deleted file mode 100644
index b878c2a..0000000
--- a/src/source_msgraph/core/utils.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from typing import Any
-from kiota_serialization_json.json_serialization_writer_factory import JsonSerializationWriterFactory
-import json
-
-from pyspark.sql.types import (
-    StructType, StructField, StringType, IntegerType, DoubleType, BooleanType,
-    MapType, ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType
-)
-
-from datetime import datetime, date
-from decimal import Decimal
-
-# Convert to JSON using Kiota
-writer_factory = JsonSerializationWriterFactory()
-writer = writer_factory.get_serialization_writer("application/json")
-
-def to_json(value):
-    value.serialize(writer)
-    # Get JSON string
-    return json.loads((writer.get_serialized_content().decode("utf-8")))
-
-def to_jsonValue(value):
-    value.serialize(writer)
-    # Get JSON string
-    return str(json.loads((writer.get_serialized_content().decode("utf-8"))))
-
-
-
-def get_python_schema(obj:Any):
-    """
-    Recursively extracts the schema from a Python object.
-
-    :param obj: The Python object (dict, list, int, str, etc.).
-    :return: A schema dictionary representing field types.
-    """
-    if isinstance(obj, bool):
-        return "bool"
-    elif isinstance(obj, dict):
-        return {key: get_python_schema(value) for key, value in obj.items()}
-    elif isinstance(obj, list):
-        if obj: # Assume first element type (homogeneous lists)
-            return [get_python_schema(obj[0])]
-        return ["any"] # Empty lists default to "any"
-    elif isinstance(obj, str):
-        return "str"
-    elif isinstance(obj, int):
-        return "int"
-    elif isinstance(obj, float):
-        return "float"
-    elif isinstance(obj, datetime):
-        return "datetime"
-    elif isinstance(obj, date):
-        return "date"
-    elif isinstance(obj, Decimal):
-        return "decimal"
-    elif obj is None:
-        return "null"
-    return "unknown" # Fallback for unrecognized types
-
-def to_pyspark_schema(schema_dict):
-    """
-    Recursively converts a nested Python schema dictionary to a PySpark StructType schema.
-
-    :param schema_dict: Dictionary with field names as keys and data types as values.
-    :return: PySpark StructType schema.
-    """
-    type_mapping = {
-        "str": StringType(),
-        "int": IntegerType(),
-        "float": DoubleType(),
-        "bool": BooleanType(),
-        "datetime": TimestampType(),
-        "date": DateType(),
-        "long": LongType(),
-        "binary": BinaryType(),
-        "decimal": DecimalType(38, 18),
-        "unknown": StringType()
-    }
-
-    def convert_type(value):
-        """Recursively converts types, handling nested dicts and lists."""
-        if isinstance(value, dict): # Nested structure
-            return StructType([StructField(k, convert_type(v), True) for k, v in value.items()])
-        elif isinstance(value, list): # List of elements (assume first element type)
-            if not value:
-                return ArrayType(StringType()) # Default to list of strings if empty
-            return ArrayType(convert_type(value[0]))
-        return type_mapping.get(value, StringType()) # Default to StringType
-
-    struct_fields = [StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items()]
-    return StructType(struct_fields)
\ No newline at end of file
diff --git a/src/source_msgraph/resources/__init__.py b/src/source_msgraph/resources/__init__.py
deleted file mode 100644
index 5ee7122..0000000
--- a/src/source_msgraph/resources/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .list_items import * # type: ignore
\ No newline at end of file
diff --git a/src/source_msgraph/resources/list_items.py b/src/source_msgraph/resources/list_items.py
deleted file mode 100644
index b9927d5..0000000
--- a/src/source_msgraph/resources/list_items.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from functools import cached_property
-import logging
-from typing import Dict
-
-from source_msgraph.core.base_client import BaseResourceProvider
-from source_msgraph.core.models import BaseResource
-
-
-class ListItemsResourceProvider(BaseResourceProvider):
-
-    def __init__(self, options: Dict[str, str]):
-        self.options = options
-        super().__init__(options)
-
-    @cached_property
-    def resource(self) -> BaseResource:
-        return BaseResource(
-            name="list_items",
-            resource_name="items",
-            request_builder_module="sites.item.lists.item.items.items_request_builder"
-        ).map_options_to_params(self.options)
-
-
diff --git a/tests/test_source.py b/tests/test_source.py
index 17cd168..777690e 100644
--- a/tests/test_source.py
+++ b/tests/test_source.py
@@ -1,6 +1,6 @@
 import pytest
 from pyspark.sql import SparkSession
-from source_msgraph.source import FakeDataSource
+from pyspark_msgraph_source.core.source import FakeDataSource
 
 
 # @pytest.fixture
From 0df8c829fb8e3bf27c318d5871591a1f4b75cea1 Mon Sep 17 00:00:00 2001
From: geekwhocodes
Date: Wed, 5 Mar 2025 12:36:35 +0000
Subject: [PATCH 05/10] change
package name --- src/pyspark_msgraph_source/__init__.py | 0 .../core/async_interator.py | 69 +++++++ .../core/base_client.py | 97 ++++++++++ src/pyspark_msgraph_source/core/constants.py | 2 + src/pyspark_msgraph_source/core/models.py | 177 ++++++++++++++++++ .../core/resource_provider.py | 51 +++++ src/pyspark_msgraph_source/core/source.py | 53 ++++++ src/pyspark_msgraph_source/core/utils.py | 91 +++++++++ .../resources/__init__.py | 1 + .../resources/list_items.py | 23 +++ 10 files changed, 564 insertions(+) create mode 100644 src/pyspark_msgraph_source/__init__.py create mode 100644 src/pyspark_msgraph_source/core/async_interator.py create mode 100644 src/pyspark_msgraph_source/core/base_client.py create mode 100644 src/pyspark_msgraph_source/core/constants.py create mode 100644 src/pyspark_msgraph_source/core/models.py create mode 100644 src/pyspark_msgraph_source/core/resource_provider.py create mode 100644 src/pyspark_msgraph_source/core/source.py create mode 100644 src/pyspark_msgraph_source/core/utils.py create mode 100644 src/pyspark_msgraph_source/resources/__init__.py create mode 100644 src/pyspark_msgraph_source/resources/list_items.py diff --git a/src/pyspark_msgraph_source/__init__.py b/src/pyspark_msgraph_source/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyspark_msgraph_source/core/async_interator.py b/src/pyspark_msgraph_source/core/async_interator.py new file mode 100644 index 0000000..b2c121a --- /dev/null +++ b/src/pyspark_msgraph_source/core/async_interator.py @@ -0,0 +1,69 @@ +import asyncio + +import asyncio +from typing import AsyncGenerator, Iterator, Any + +import asyncio +from typing import AsyncGenerator, Iterator, Any + +class AsyncToSyncIterator: + """ + Converts an async generator into a synchronous iterator while ensuring proper event loop handling. + """ + + def __init__(self, async_gen: AsyncGenerator[Any, None]): + """ + Initializes the iterator by consuming an async generator synchronously. + + Args: + async_gen (AsyncGenerator): The async generator yielding results. + """ + self.async_gen = async_gen + self.iterator = self._to_iterator() + + def _to_iterator(self) -> Iterator: + """ + Ensures that the async generator is consumed using the correct event loop. + Uses streaming (does not load all results into memory). + """ + try: + loop = asyncio.get_running_loop() + return self._sync_generator(loop) # Works inside Jupyter + except RuntimeError: + return iter(asyncio.run(self._collect_results())) # Works in scripts + + def _sync_generator(self, loop: asyncio.AbstractEventLoop) -> Iterator: + """ + Streams async results into a sync generator while inside a running event loop. 
+ """ + queue = asyncio.Queue() + + async def _producer(): + """Fills the queue with async results.""" + async for item in self.async_gen: + await queue.put(item) + await queue.put(None) # Sentinel to signal completion + + async def _consumer(): + """Yields items from the queue in sync mode.""" + task = loop.create_task(_producer()) + while True: + item = await queue.get() + if item is None: + break + yield item + await task # Ensure producer task completes + + return iter(loop.run_until_complete(self._collect_results())) + + async def _collect_results(self): + """Collects async generator results into a list (safe for asyncio.run).""" + return [item async for item in self.async_gen] + + def __iter__(self) -> Iterator: + """Returns the synchronous iterator.""" + return self.iterator + + def __next__(self) -> Any: + """Returns the next item from the iterator.""" + return next(self.iterator) \ No newline at end of file diff --git a/src/pyspark_msgraph_source/core/base_client.py b/src/pyspark_msgraph_source/core/base_client.py new file mode 100644 index 0000000..56bc709 --- /dev/null +++ b/src/pyspark_msgraph_source/core/base_client.py @@ -0,0 +1,97 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict +from msgraph import GraphServiceClient +from kiota_abstractions.base_request_configuration import RequestConfiguration +from msgraph.generated.models.o_data_errors.o_data_error import ODataError +from pyspark_msgraph_source.core.async_interator import AsyncToSyncIterator +from pyspark_msgraph_source.core.models import BaseResource +from pyspark_msgraph_source.core.utils import get_python_schema, to_json, to_pyspark_schema + +from azure.identity import DefaultAzureCredential, EnvironmentCredential + +class BaseResourceProvider(ABC): + def __init__(self, options: Dict[str, Any]): + """ + Initializes the fetcher with the Graph client, resource path, and query parameters. + + :param options: Connector options. + """ + self.options = options + credentials = DefaultAzureCredential() + self.graph_client = GraphServiceClient(credentials=credentials) + + async def fetch_data(self): + """ + Fetches data from Microsoft Graph using the dynamically built request. + Handles pagination automatically. + """ + query_parameters_cls = self.resource.get_query_parameters_cls() + + if query_parameters_cls: + try: + query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments + except TypeError as e: + raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") + + if self.resource.query_params: + for k, v in self.resource.query_params.items(): + k = k.removeprefix("%24") + if hasattr(query_parameters_instance, k): + setattr(query_parameters_instance, k, v) # Set attributes dynamically + else: + raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") + + request_configuration = RequestConfiguration( + query_parameters=query_parameters_instance + ) + + try: + builder = self.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.resource.resource_params) + items = await builder.get(request_configuration=request_configuration) + while True: + for item in items.value: + yield item + if not items.odata_next_link: + break + items = await builder.with_url(items.odata_next_link).get() + + except ODataError as e: + raise Exception(f"Graph API Error: {e.error.message}") + + def iter_records(self): + """ + Iterates over records from the Microsoft Graph API. 
+ + :param options: Connector options containing authentication credentials and resource details. + :return: A synchronous iterator over the fetched data. + :raises ValueError: If any required credentials or resource parameters are missing. + :raises GraphAPIError: If the API request fails. + """ + async_gen = self.fetch_data() + return AsyncToSyncIterator(async_gen) + + def get_resource_schema(self) -> Dict[str, Any]: + """ + Retrieves the schema of a Microsoft Graph API resource by fetching a single record. + + :param options: Connector options containing authentication credentials and resource details. + :return: A dictionary representing the schema of the resource. + :raises ValueError: If no records are found or if required options are missing. + :raises GraphAPIError: If the API request fails. + """ + async_gen = self.fetch_data() + + try: + record = next(AsyncToSyncIterator(async_gen), None) + if not record: + raise ValueError(f"No records found for resource: {self.resource.resource_name}") + record = to_json(record) + schema = to_pyspark_schema(get_python_schema(record)) + return record, schema + + except StopIteration: + raise ValueError(f"No records available for {self.resource.resource_name}") + + @abstractmethod + def resource(self) -> BaseResource: + ... \ No newline at end of file diff --git a/src/pyspark_msgraph_source/core/constants.py b/src/pyspark_msgraph_source/core/constants.py new file mode 100644 index 0000000..6a42334 --- /dev/null +++ b/src/pyspark_msgraph_source/core/constants.py @@ -0,0 +1,2 @@ +# Base generated package for Microsoft Graph SDK +MSGRAPH_SDK_PACKAGE = "msgraph.generated" diff --git a/src/pyspark_msgraph_source/core/models.py b/src/pyspark_msgraph_source/core/models.py new file mode 100644 index 0000000..7651816 --- /dev/null +++ b/src/pyspark_msgraph_source/core/models.py @@ -0,0 +1,177 @@ +from dataclasses import dataclass +import importlib +import inspect +import logging +import re +from typing import Any, Dict +from pyspark_msgraph_source.core.constants import MSGRAPH_SDK_PACKAGE +from urllib.parse import unquote +from kiota_abstractions.base_request_builder import BaseRequestBuilder + +@dataclass +class BaseResource: + name: str # User friendly name for Spark reader + resource_name: str # Microsoft Graph leaf resource name + request_builder_module: str + query_params: Dict[str, Any] = None + resource_params: Dict[str, Any] = None + request_builder_cls_name: str = None + request_builder_query_cls_name: str = None + + def __post_init__(self): + if not self.name: + raise ValueError("name is required") + + self.request_builder_cls_name = self._pascal_case(f"{self.resource_name}_request_builder") + #self.request_builder_cls = self.get_request_builder_cls() + self.request_builder_query_cls_name = self._pascal_case(f"{self.resource_name}_request_builder_get_query_parameters") + #self.query_parameters_cls = self.get_query_parameters_cls() + self.parse_url_template() + + + @classmethod + def _pascal_case(cls, snake_str: str) -> str: + """ + Converts snake_case to PascalCase. + Example: "items_request_builder" -> "ItemsRequestBuilder" + """ + return "".join(word.title() for word in snake_str.split("_")) + + def get_query_parameters_cls(self): + """ + Retrieves the query parameters class from the request builder module. 
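+
+        :return: The query parameters class exposed by the request builder.
+        :raises AttributeError: If the request builder or its query parameters class cannot be found.
+        :raises ImportError: If the request builder module is not present in the SDK package.
+
+        Example (a hypothetical sketch for the SharePoint list-items resource;
+        the class name follows the PascalCase convention derived below)::
+
+            qp_cls = resource.get_query_parameters_cls()
+            params = qp_cls()   # e.g. ItemsRequestBuilderGetQueryParameters
+            params.top = 999    # standard OData paging hint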
+ """ + try: + module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") + request_builder_cls = getattr(module, self.request_builder_cls_name, None) + + if not request_builder_cls or not issubclass(request_builder_cls, BaseRequestBuilder): + raise AttributeError(f"{self.request_builder_cls_name} not found in {module.__name__}") + + # Inspect the attributes to find the query parameters class + + for attr in dir(request_builder_cls): + if attr == self.request_builder_query_cls_name: + return getattr(request_builder_cls, attr) + raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") + + except ModuleNotFoundError: + raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") + + def get_request_builder_cls(self) -> BaseRequestBuilder: + """ + Dynamically imports a module and finds the RequestBuilder class. + """ + try: + module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") + for attr in dir(module): + if attr == self.request_builder_cls_name: + cls = getattr(module, attr) + if not issubclass(cls, BaseRequestBuilder): + raise AttributeError(f"{attr} is not a subclass of BaseRequestBuilder") + return cls + except ImportError: + raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") + + def get_request_builder_url_template(self): + """ + Extracts the `url_template` by analyzing the source code of the class. + """ + try: + cls = self.get_request_builder_cls() + if inspect.isclass(cls) and hasattr(cls, "__init__"): + # Extract the __init__ function source code + init_source = inspect.getsource(cls.__init__) + if "super().__init__(" in init_source: + lines = init_source.split("\n") + for line in lines: + if "super().__init__(" in line: + match = re.search(r'super\(\).__init__\s*\([^,]+,\s*"([^"]+)"', line) + if match: + url_template = match.group(1).replace('"', "").replace("'", "") + return url_template + + except TypeError: + raise TypeError(f"Error extracting URL template from {cls.__name__}") + + def parse_url_template(self): + """ + Parses the `url_template` string to extract path parameters and query parameters. + """ + url_template = self.get_request_builder_url_template() + if not url_template: + raise ValueError("URL template not found in request builder class") + + # Extract path parameters (decode %2Did → _id) + path_parameters = [ + unquote(match.group(1)).replace("%2D", "_") + for match in re.finditer(r"\{([^?}]+)\}", url_template) + if match.group(1).lower() != "+baseurl" + ] + + # Extract query parameters (decode %24expand → $expand) + query_match = re.search(r"\{\?([^}]+)\}", url_template) + query_parameters = ( + [unquote(q).replace("%24", "$") for q in query_match.group(1).split(",")] + if query_match else [] + ) + + self.resource_params = {k:None for k in path_parameters} + self.query_params = {qp.strip().replace("$", ""): None for qp in query_parameters} + + + def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': + """ + Maps the provided options to either query parameters or resource parameters. + + :param options: Dictionary of options provided by the user. + :param query_params: List of valid query parameter names. + :param resource_params: List of valid resource parameter names. 
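+        :note: Query parameter keys are re-encoded with a ``%24`` prefix (the
+            escaped ``$``) and hyphens in path parameter names are escaped as
+            ``%2D`` before the request is built.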
+ :return: A tuple (mapped_query_params, mapped_resource_params, invalid_params) + """ + missing_params = [param for param in self.resource_params if param not in options] + + if missing_params: + raise ValueError(f"Missing required resource parameters: {', '.join(missing_params)}") + + # TODO: add max $top value validation. + if int(options.get("top", 1)) <= 100: + logging.warning("Setting a low `top` value in Microsoft Graph queries can cause high latency and increase throttling risk.") + + mapped_query_params = {"%24"+k: v for k, v in options.items() if k in self.query_params} + mapped_resource_params = {k.replace("-", "%2D"): v for k, v in options.items() if k in self.resource_params} + + invalid_params = {k: v for k, v in options.items() if k not in self.query_params and k not in self.resource_params} + + if len(invalid_params) > 0: + raise ValueError(f"Extra parameters {invalid_params} not allowed.") + + self.query_params = mapped_query_params + self.resource_params = mapped_resource_params + + return self + + + +GUID_PATTERN = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") + + +@dataclass +class ConnectorOptions: + """Options for Microsoft Graph API requests with strict resource_path validation.""" + tenant_id: str + client_id: str + client_secret: str + def __post_init__(self): + ... + + def _validate_credentials(self): + """Validates the format and presence of credentials.""" + if not self.tenant_id or not GUID_PATTERN.match(self.tenant_id): + raise ValueError("Invalid tenant_id: must be a valid GUID.") + + if not self.client_id or not GUID_PATTERN.match(self.client_id): + raise ValueError("Invalid client_id: must be a valid GUID.") + + if not self.client_secret or not isinstance(self.client_secret, str): + raise ValueError("Invalid client_secret: must be a non-empty string.") \ No newline at end of file diff --git a/src/pyspark_msgraph_source/core/resource_provider.py b/src/pyspark_msgraph_source/core/resource_provider.py new file mode 100644 index 0000000..bc72ac6 --- /dev/null +++ b/src/pyspark_msgraph_source/core/resource_provider.py @@ -0,0 +1,51 @@ +from functools import lru_cache +import importlib +import logging +import pkgutil +from typing import Dict, Type +from pyspark_msgraph_source.core.base_client import BaseResourceProvider + +# @lru_cache(maxsize=10) +def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: + """ + Dynamically loads all resource providers from the resources package + """ + providers = {} + root_package = __package__.split('.')[0] + logging.debug(f"Current root package {root_package}.") + + package = f'{root_package}.resources' + + resources_pkg = importlib.import_module(package) + + for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__): + if name != 'base': # Skip the base module + try: + module = importlib.import_module(f'{package}.{name}') + for attr_name in dir(module): + if attr_name.endswith('ResourceProvider'): + provider_class = getattr(module, attr_name) + if (isinstance(provider_class, type) and + issubclass(provider_class, BaseResourceProvider) and + provider_class != BaseResourceProvider): + providers[name] = provider_class + except ImportError as e: + print(f"Warning: Could not load resource provider {name}: {e}") + + return frozenset(providers.items()) + +# @lru_cache(maxsize=10) +def get_resource_provider(resource_name: str, options: frozenset) -> BaseResourceProvider: + """ + Factory method to get the appropriate resource provider + """ + providers = 
dict(load_resource_providers()) + provider_class: BaseResourceProvider = providers.get(resource_name) + + if not provider_class: + available = ', '.join(providers.keys()) + raise ValueError( + f"Unsupported resource name: '{resource_name}'. " + f"Available resources: {available}" + ) + return provider_class(dict(options)) \ No newline at end of file diff --git a/src/pyspark_msgraph_source/core/source.py b/src/pyspark_msgraph_source/core/source.py new file mode 100644 index 0000000..efa4660 --- /dev/null +++ b/src/pyspark_msgraph_source/core/source.py @@ -0,0 +1,53 @@ +import logging +from typing import Any, Dict, Union +from pyspark.sql.datasource import DataSource, DataSourceReader +from pyspark.sql.types import StructType +from pyspark_msgraph_source.core.base_client import BaseResourceProvider + +from pyspark_msgraph_source.core.resource_provider import get_resource_provider + +# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources + +logger = logging.getLogger(__name__) + +class MSGraphDataSource(DataSource): + """ + + """ + def __init__(self, options: Dict[str, Any]): + + self.resource_name = options.pop("resource") + if not self.resource_name: + raise ValueError("resource is missing, please provide a valid resource name.") + self.options = frozenset(options.items()) + + @classmethod + def name(cls): + return "msgraph" + + def schema(self): + logger.info("Schema not provided, infering from the source.") + resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + _, schema = resource_provider.get_resource_schema() + logger.debug(f"Infered schema : {schema}") + return schema + + def reader(self, schema: StructType): + return MSGraphDataSourceReader(self.resource_name, self.options, schema) + + +class MSGraphDataSourceReader(DataSourceReader): + + def __init__(self, resource_name :str, options: frozenset, schema: Union[StructType, str]): + self.schema: StructType = schema + self.options = options + self.resource_name = resource_name + + def read(self, partition): + from pyspark_msgraph_source.core.utils import to_json + from pyspark.sql import Row + resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + for row in resource_provider.iter_records(): + row = to_json(row) + row_data = {f.name: row.get(f.name, None) for f in self.schema.fields} + yield Row(**row_data) diff --git a/src/pyspark_msgraph_source/core/utils.py b/src/pyspark_msgraph_source/core/utils.py new file mode 100644 index 0000000..b878c2a --- /dev/null +++ b/src/pyspark_msgraph_source/core/utils.py @@ -0,0 +1,91 @@ +from typing import Any +from kiota_serialization_json.json_serialization_writer_factory import JsonSerializationWriterFactory +import json + +from pyspark.sql.types import ( + StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, + MapType, ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType +) + +from datetime import datetime, date +from decimal import Decimal + +# Convert to JSON using Kiota +writer_factory = JsonSerializationWriterFactory() +writer = writer_factory.get_serialization_writer("application/json") + +def to_json(value): + value.serialize(writer) + # Get JSON string + return json.loads((writer.get_serialized_content().decode("utf-8"))) + +def to_jsonValue(value): + value.serialize(writer) + # Get JSON string + return str(json.loads((writer.get_serialized_content().decode("utf-8")))) + + + +def get_python_schema(obj:Any): + """ + Recursively extracts 
the schema from a Python object. + + :param obj: The Python object (dict, list, int, str, etc.). + :return: A schema dictionary representing field types. + """ + if isinstance(obj, bool): + return "bool" + elif isinstance(obj, dict): + return {key: get_python_schema(value) for key, value in obj.items()} + elif isinstance(obj, list): + if obj: # Assume first element type (homogeneous lists) + return [get_python_schema(obj[0])] + return ["any"] # Empty lists default to "any" + elif isinstance(obj, str): + return "str" + elif isinstance(obj, int): + return "int" + elif isinstance(obj, float): + return "float" + elif isinstance(obj, datetime): + return "datetime" + elif isinstance(obj, date): + return "date" + elif isinstance(obj, Decimal): + return "decimal" + elif obj is None: + return "null" + return "unknown" # Fallback for unrecognized types + +def to_pyspark_schema(schema_dict): + """ + Recursively converts a nested Python schema dictionary to a PySpark StructType schema. + + :param schema_dict: Dictionary with field names as keys and data types as values. + :return: PySpark StructType schema. + """ + type_mapping = { + "str": StringType(), + "int": IntegerType(), + "float": DoubleType(), + "bool": BooleanType(), + "datetime": TimestampType(), + "date": DateType(), + "long": LongType(), + "binary": BinaryType(), + "decimal": DecimalType(38, 18), + "unknown": StringType() + } + + def convert_type(value): + """Recursively converts types, handling nested dicts and lists.""" + if isinstance(value, dict): # Nested structure + return StructType([StructField(k, convert_type(v), True) for k, v in value.items()]) + elif isinstance(value, list): # List of elements (assume first element type) + if not value: + return ArrayType(StringType()) # Default to list of strings if empty + return ArrayType(convert_type(value[0])) + return type_mapping.get(value, StringType()) # Default to StringType + + struct_fields = [StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items()] + return StructType(struct_fields) \ No newline at end of file diff --git a/src/pyspark_msgraph_source/resources/__init__.py b/src/pyspark_msgraph_source/resources/__init__.py new file mode 100644 index 0000000..5ee7122 --- /dev/null +++ b/src/pyspark_msgraph_source/resources/__init__.py @@ -0,0 +1 @@ +from .list_items import * # type: ignore \ No newline at end of file diff --git a/src/pyspark_msgraph_source/resources/list_items.py b/src/pyspark_msgraph_source/resources/list_items.py new file mode 100644 index 0000000..e3d1293 --- /dev/null +++ b/src/pyspark_msgraph_source/resources/list_items.py @@ -0,0 +1,23 @@ +from functools import cached_property +import logging +from typing import Dict + +from pyspark_msgraph_source.core.base_client import BaseResourceProvider +from pyspark_msgraph_source.core.models import BaseResource + + +class ListItemsResourceProvider(BaseResourceProvider): + + def __init__(self, options: Dict[str, str]): + self.options = options + super().__init__(options) + + @cached_property + def resource(self) -> BaseResource: + return BaseResource( + name="list_items", + resource_name="items", + request_builder_module="sites.item.lists.item.items.items_request_builder" + ).map_options_to_params(self.options) + + From 13bca5db3f700ca82d6f01d8ae9b1541df55620c Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 13:36:07 +0000 Subject: [PATCH 06/10] init docs --- docs/api/core.md | 3 + docs/api/index.md | 14 + docs/index.md | 17 + mkdocs.yml | 18 + poetry.lock | 425 
+++++++++++++++++++- pyproject.toml | 3 + src/pyspark_msgraph_source/core/__init__.py | 0 7 files changed, 473 insertions(+), 7 deletions(-) create mode 100644 docs/api/core.md create mode 100644 docs/api/index.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml create mode 100644 src/pyspark_msgraph_source/core/__init__.py diff --git a/docs/api/core.md b/docs/api/core.md new file mode 100644 index 0000000..b5c6a31 --- /dev/null +++ b/docs/api/core.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.async_interator diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..86888cd --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,14 @@ +# API Reference + +Welcome to the API Reference of `your_package`. + +Below are the available modules and submodules: + +## Core +- [Core Overview](core.md) + +## Utils +- [Utils Helpers](utils.md) + +## API Client +- [API Client](api_client.md) diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..000ea34 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,17 @@ +# Welcome to MkDocs + +For full documentation visit [mkdocs.org](https://www.mkdocs.org). + +## Commands + +* `mkdocs new [dir-name]` - Create a new project. +* `mkdocs serve` - Start the live-reloading docs server. +* `mkdocs build` - Build the documentation site. +* `mkdocs -h` - Print help message and exit. + +## Project layout + + mkdocs.yml # The configuration file. + docs/ + index.md # The documentation homepage. + ... # Other markdown pages, images and other files. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..8109506 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,18 @@ +site_name: Pyspark MSGraph Source +theme: + name: material + +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: ["src/"] # or wherever your package code is + options: + show_source: true + +nav: + - Home: index.md + - API Reference: + - Overview: api/index.md + - Core: api/core.md diff --git a/poetry.lock b/poetry.lock index 74e07f5..77f6987 100644 --- a/poetry.lock +++ b/poetry.lock @@ -240,6 +240,40 @@ msal = ">=1.30.0" msal-extensions = ">=1.2.0" typing-extensions = ">=4.0.0" +[[package]] +name = "babel" +version = "2.17.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, +] + +[package.extras] +dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] + +[[package]] +name = "backrefs" +version = "5.8" +description = "A wrapper around re and regex that adds additional back references." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"}, + {file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"}, + {file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"}, + {file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"}, + {file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"}, + {file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"}, +] + +[package.extras] +extras = ["regex"] + [[package]] name = "black" version = "25.1.0" @@ -291,7 +325,7 @@ version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, @@ -396,7 +430,7 @@ version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -514,7 +548,6 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -821,6 +854,24 @@ files = [ {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] +[[package]] +name = "ghp-import" +version = "2.1.0" +description = "Copy your docs directly to the gh-pages branch." +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, + {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.1" + +[package.extras] +dev = ["flake8", "markdown", "twine", "wheel"] + [[package]] name = "googleapis-common-protos" version = "1.67.0" @@ -839,6 +890,21 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4 [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] +[[package]] +name = "griffe" +version = "1.6.0" +description = "Signatures for entire Python programs. 
Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"}, + {file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"}, +] + +[package.dependencies] +colorama = ">=0.4" + [[package]] name = "grpcio" version = "1.70.0" @@ -1045,7 +1111,7 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -1197,6 +1263,24 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] +[[package]] +name = "jinja2" +version = "3.1.5" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + [[package]] name = "jupyter-client" version = "8.6.3" @@ -1257,6 +1341,77 @@ files = [ docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] testing = ["coverage", "pyyaml"] +[[package]] +name = "markupsafe" +version = "3.0.2" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = 
"MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -1284,6 +1439,18 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +description = "A deep merge function for 🐍." +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, + {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, +] + [[package]] name = "microsoft-kiota-abstractions" version = "1.9.2" @@ -1398,6 +1565,157 @@ files = [ [package.dependencies] microsoft-kiota-abstractions = ">=1.9.2,<1.10.0" +[[package]] +name = "mkdocs" +version = "1.6.1" +description = "Project documentation with Markdown." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"}, + {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} +ghp-import = ">=1.0" +jinja2 = ">=2.11.1" +markdown = ">=3.3.6" +markupsafe = ">=2.0.1" +mergedeep = ">=1.3.4" +mkdocs-get-deps = ">=0.2.0" +packaging = ">=20.5" +pathspec = ">=0.11.1" +pyyaml = ">=5.1" +pyyaml-env-tag = ">=0.1" +watchdog = ">=2.0" + +[package.extras] +i18n = ["babel (>=2.9.0)"] +min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4) ; platform_system == \"Windows\"", "ghp-import (==1.0)", "importlib-metadata (==4.4) ; python_version < \"3.10\"", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] + +[[package]] +name = "mkdocs-autorefs" +version = "1.4.0" +description = "Automatically link across pages in MkDocs." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocs_autorefs-1.4.0-py3-none-any.whl", hash = "sha256:bad19f69655878d20194acd0162e29a89c3f7e6365ffe54e72aa3fd1072f240d"}, + {file = "mkdocs_autorefs-1.4.0.tar.gz", hash = "sha256:a9c0aa9c90edbce302c09d050a3c4cb7c76f8b7b2c98f84a7a05f53d00392156"}, +] + +[package.dependencies] +Markdown = ">=3.3" +markupsafe = ">=2.0.1" +mkdocs = ">=1.1" + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"}, + {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"}, +] + +[package.dependencies] +mergedeep = ">=1.3.4" +platformdirs = ">=2.2.0" +pyyaml = ">=5.1" + +[[package]] +name = "mkdocs-material" +version = "9.6.7" +description = "Documentation that simply works" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"}, + {file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"}, +] + +[package.dependencies] +babel = ">=2.10,<3.0" +backrefs = ">=5.7.post1,<6.0" +colorama = ">=0.4,<1.0" +jinja2 = ">=3.0,<4.0" +markdown = ">=3.2,<4.0" +mkdocs = ">=1.6,<2.0" +mkdocs-material-extensions = ">=1.3,<2.0" +paginate = ">=0.5,<1.0" +pygments = ">=2.16,<3.0" +pymdown-extensions = ">=10.2,<11.0" +requests = ">=2.26,<3.0" + +[package.extras] +git = ["mkdocs-git-committers-plugin-2 (>=1.1,<3)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] +imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"] +recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +description = "Extension pack for Python Markdown and MkDocs Material." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"}, + {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, +] + +[[package]] +name = "mkdocstrings" +version = "0.28.2" +description = "Automatic documentation from sources, for MkDocs." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocstrings-0.28.2-py3-none-any.whl", hash = "sha256:57f79c557e2718d217d6f6a81bf75a0de097f10e922e7e5e00f085c3f0ff6895"}, + {file = "mkdocstrings-0.28.2.tar.gz", hash = "sha256:9b847266d7a588ea76a8385eaebe1538278b4361c0d1ce48ed005be59f053569"}, +] + +[package.dependencies] +Jinja2 = ">=2.11.1" +Markdown = ">=3.6" +MarkupSafe = ">=1.1" +mkdocs = ">=1.4" +mkdocs-autorefs = ">=1.4" +mkdocs-get-deps = ">=0.2" +mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} +pymdown-extensions = ">=6.3" + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "1.16.2" +description = "A Python handler for mkdocstrings." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocstrings_python-1.16.2-py3-none-any.whl", hash = "sha256:ff7e719404e59ad1a72f1afbe854769984c889b8fa043c160f6c988e1ad9e966"}, + {file = "mkdocstrings_python-1.16.2.tar.gz", hash = "sha256:942ec1a2e0481d28f96f93be3d6e343cab92a21e5baf01c37dd2d7236c4d0bd7"}, +] + +[package.dependencies] +griffe = ">=0.49" +mkdocs-autorefs = ">=1.4" +mkdocstrings = ">=0.28.2" + [[package]] name = "msal" version = "1.31.1" @@ -1795,6 +2113,22 @@ files = [ {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] +[[package]] +name = "paginate" +version = "0.5.7" +description = "Divides large result sets into pages for easier browsing" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"}, + {file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"}, +] + +[package.extras] +dev = ["pytest", "tox"] +lint = ["black"] + [[package]] name = "pandas" version = "2.2.3" @@ -2330,6 +2664,25 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pymdown-extensions" +version = "10.14.3" +description = "Extension pack for Python Markdown." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pymdown_extensions-10.14.3-py3-none-any.whl", hash = "sha256:05e0bee73d64b9c71a4ae17c72abc2f700e8bc8403755a00580b49a4e9f189e9"}, + {file = "pymdown_extensions-10.14.3.tar.gz", hash = "sha256:41e576ce3f5d650be59e900e4ceff231e0aed2a88cf30acaee41e02f063a061b"}, +] + +[package.dependencies] +markdown = ">=3.6" +pyyaml = "*" + +[package.extras] +extra = ["pygments (>=2.19.1)"] + [[package]] name = "pyspark" version = "4.0.0.dev2" @@ -2491,6 +2844,21 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "pyyaml-env-tag" +version = "0.1" +description = "A custom YAML tag for referencing environment variables in YAML files. " +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, + {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, +] + +[package.dependencies] +pyyaml = "*" + [[package]] name = "pyzmq" version = "26.2.1" @@ -2619,7 +2987,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2758,7 +3126,7 @@ version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, @@ -2791,6 +3159,49 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +[[package]] +name = "watchdog" +version = "6.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d1cdb490583ebd691c012b3d6dae011000fe42edb7a82ece80965b42abd61f26"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc64ab3bdb6a04d69d4023b29422170b74681784ffb9463ed4870cf2f3e66112"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c897ac1b55c5a1461e16dae288d22bb2e412ba9807df8397a635d88f671d36c3"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e6f0e77c9417e7cd62af82529b10563db3423625c5fce018430b249bf977f9e8"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:90c8e78f3b94014f7aaae121e6b909674df5b46ec24d6bebc45c44c56729af2a"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7631a77ffb1f7d2eefa4445ebbee491c720a5661ddf6df3498ebecae5ed375c"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c7ac31a19f4545dd92fc25d200694098f42c9a8e391bc00bdd362c5736dbf881"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9513f27a1a582d9808cf21a07dae516f0fab1cf2d7683a742c498b93eedabb11"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7a0e56874cfbc4b9b05c60c8a1926fedf56324bb08cfbc188969777940aef3aa"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6439e374fc012255b4ec786ae3c4bc838cd7309a540e5fe0952d03687d8804e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2"}, + {file = "watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a"}, + {file = "watchdog-6.0.0-py3-none-win_amd64.whl", hash = 
"sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680"}, + {file = "watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f"}, + {file = "watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282"}, +] + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + [[package]] name = "wcwidth" version = "0.2.13" @@ -3012,4 +3423,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4" -content-hash = "0cdc9d351347552e7a8e246e9d9663b1e1c11fa0b87a86cdafdcecfeff21fb83" +content-hash = "24c46e7ab41949a8b9dd45260a7c6725f13c1133550548e3010cf1bd30f5a2e6" diff --git a/pyproject.toml b/pyproject.toml index 0838f00..b37169b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ grpcio-status = "^1.60.1" pandas = "^2.2.0" ipykernel = "^6.29.5" markdown = "^3.7" +mkdocs = "^1.6.1" +mkdocs-material = "^9.6.7" +mkdocstrings = {extras = ["python"], version = "^0.28.2"} [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/src/pyspark_msgraph_source/core/__init__.py b/src/pyspark_msgraph_source/core/__init__.py new file mode 100644 index 0000000..e69de29 From 446257ada06efedf63949eec4bb6e49f43903c41 Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 14:22:06 +0000 Subject: [PATCH 07/10] add core docs --- docs/api/core.md | 3 - docs/api/core/async-iterator.md | 3 + docs/api/core/client.md | 3 + docs/api/core/models.md | 3 + docs/api/core/resource-provider.md | 3 + docs/api/core/utils.md | 3 + mkdocs.yml | 14 +- .../{async_interator.py => async_iterator.py} | 2 + .../core/base_client.py | 113 +++++++++++--- src/pyspark_msgraph_source/core/models.py | 146 ++++++++++++------ .../core/resource_provider.py | 59 +++++-- src/pyspark_msgraph_source/core/source.py | 107 +++++++++++-- src/pyspark_msgraph_source/core/utils.py | 89 ++++++++--- 13 files changed, 415 insertions(+), 133 deletions(-) delete mode 100644 docs/api/core.md create mode 100644 docs/api/core/async-iterator.md create mode 100644 docs/api/core/client.md create mode 100644 docs/api/core/models.md create mode 100644 docs/api/core/resource-provider.md create mode 100644 docs/api/core/utils.md rename src/pyspark_msgraph_source/core/{async_interator.py => async_iterator.py} (94%) diff --git a/docs/api/core.md b/docs/api/core.md deleted file mode 100644 index b5c6a31..0000000 --- a/docs/api/core.md +++ /dev/null @@ -1,3 +0,0 @@ -# Core Engine - -::: pyspark_msgraph_source.core.async_interator diff --git a/docs/api/core/async-iterator.md b/docs/api/core/async-iterator.md new file mode 100644 index 0000000..d280bce --- /dev/null +++ b/docs/api/core/async-iterator.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.async_iterator diff --git a/docs/api/core/client.md b/docs/api/core/client.md new file mode 100644 index 0000000..1b406a9 --- /dev/null +++ b/docs/api/core/client.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.base_client diff --git a/docs/api/core/models.md b/docs/api/core/models.md new file mode 100644 index 0000000..396ba36 --- /dev/null +++ b/docs/api/core/models.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.models diff --git a/docs/api/core/resource-provider.md b/docs/api/core/resource-provider.md new file mode 100644 index 0000000..ba837e5 --- /dev/null +++ b/docs/api/core/resource-provider.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.resource_provider diff --git 
a/docs/api/core/utils.md b/docs/api/core/utils.md new file mode 100644 index 0000000..231bd80 --- /dev/null +++ b/docs/api/core/utils.md @@ -0,0 +1,3 @@ +# Core Engine + +::: pyspark_msgraph_source.core.utils diff --git a/mkdocs.yml b/mkdocs.yml index 8109506..ae842f2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Pyspark MSGraph Source +site_name: PySpark MSGraph Source theme: name: material @@ -7,12 +7,18 @@ plugins: - mkdocstrings: handlers: python: - paths: ["src/"] # or wherever your package code is + paths: ["src/"] options: - show_source: true + show_source: false nav: - Home: index.md - API Reference: - Overview: api/index.md - - Core: api/core.md + - Core: + - Source: api/core/source.md + - Base Client: api/core/client.md + - Resource Provider: api/core/resource-provider.md + - Models: api/core/models.md + - Async Iterator: api/core/async-iterator.md + - Utils: api/core/utils.md diff --git a/src/pyspark_msgraph_source/core/async_interator.py b/src/pyspark_msgraph_source/core/async_iterator.py similarity index 94% rename from src/pyspark_msgraph_source/core/async_interator.py rename to src/pyspark_msgraph_source/core/async_iterator.py index b2c121a..4216dd9 100644 --- a/src/pyspark_msgraph_source/core/async_interator.py +++ b/src/pyspark_msgraph_source/core/async_iterator.py @@ -9,6 +9,8 @@ class AsyncToSyncIterator: """ Converts an async generator into a synchronous iterator while ensuring proper event loop handling. + + This is required because Microsoft Graph SDK for Python(https://github.com/microsoftgraph/msgraph-sdk-python) is async first. """ def __init__(self, async_gen: AsyncGenerator[Any, None]): diff --git a/src/pyspark_msgraph_source/core/base_client.py b/src/pyspark_msgraph_source/core/base_client.py index 56bc709..ec4b162 100644 --- a/src/pyspark_msgraph_source/core/base_client.py +++ b/src/pyspark_msgraph_source/core/base_client.py @@ -3,18 +3,47 @@ from msgraph import GraphServiceClient from kiota_abstractions.base_request_configuration import RequestConfiguration from msgraph.generated.models.o_data_errors.o_data_error import ODataError -from pyspark_msgraph_source.core.async_interator import AsyncToSyncIterator +from pyspark_msgraph_source.core.async_iterator import AsyncToSyncIterator from pyspark_msgraph_source.core.models import BaseResource from pyspark_msgraph_source.core.utils import get_python_schema, to_json, to_pyspark_schema -from azure.identity import DefaultAzureCredential, EnvironmentCredential +from azure.identity import DefaultAzureCredential + class BaseResourceProvider(ABC): + """ + Abstract base class to handle fetching data from Microsoft Graph API and + provide schema extraction for resources. + """ + def __init__(self, options: Dict[str, Any]): - """ - Initializes the fetcher with the Graph client, resource path, and query parameters. + """ + Initializes the resource provider with Graph client and options. + + This sets up the Microsoft Graph client using `DefaultAzureCredential`, + which automatically handles Azure Active Directory (AAD) authentication + by trying multiple credential types in a fixed order, such as: + + - Environment variables + - Managed Identity (for Azure-hosted environments) + - Azure CLI credentials + - Visual Studio Code login + - Interactive browser login (if applicable) + + This allows seamless local development and production deployments + without code changes to the authentication mechanism. 
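+
+        Note:
+            For non-interactive environments, setting the standard
+            ``AZURE_TENANT_ID``, ``AZURE_CLIENT_ID`` and ``AZURE_CLIENT_SECRET``
+            environment variables lets the environment-variable step of the
+            chain authenticate without any prompt.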
+ + See Also: + defaultazurecredential: + https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity.defaultazurecredential - :param options: Connector options. + Args: + options (Dict[str, Any]): Connector options including authentication + details and resource configurations. + + Raises: + CredentialUnavailableError: If no valid credentials are found during + authentication. """ self.options = options credentials = DefaultAzureCredential() @@ -22,14 +51,26 @@ def __init__(self, options: Dict[str, Any]): async def fetch_data(self): """ - Fetches data from Microsoft Graph using the dynamically built request. - Handles pagination automatically. + Asynchronously fetches data from Microsoft Graph API with automatic + pagination handling. + + Yields: + Any: Each record fetched from the API. + + Raises: + ValueError: If the resource query parameters cannot be instantiated. + AttributeError: If invalid query parameters are provided. + Exception: If a Graph API error occurs. + + Example: + async for record in provider.fetch_data(): + print(record) """ query_parameters_cls = self.resource.get_query_parameters_cls() if query_parameters_cls: try: - query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments + query_parameters_instance = query_parameters_cls() except TypeError as e: raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") @@ -37,16 +78,19 @@ async def fetch_data(self): for k, v in self.resource.query_params.items(): k = k.removeprefix("%24") if hasattr(query_parameters_instance, k): - setattr(query_parameters_instance, k, v) # Set attributes dynamically + setattr(query_parameters_instance, k, v) else: raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") - + request_configuration = RequestConfiguration( query_parameters=query_parameters_instance ) - + try: - builder = self.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.resource.resource_params) + builder = self.resource.get_request_builder_cls()( + self.graph_client.request_adapter, + self.resource.resource_params + ) items = await builder.get(request_configuration=request_configuration) while True: for item in items.value: @@ -60,24 +104,36 @@ async def fetch_data(self): def iter_records(self): """ - Iterates over records from the Microsoft Graph API. + Provides a synchronous iterator over records from the Microsoft Graph API. + + Returns: + Iterator[Any]: Synchronous iterator over the fetched records. + + Raises: + ValueError: If required credentials or resource parameters are missing. + Exception: If the API request fails. - :param options: Connector options containing authentication credentials and resource details. - :return: A synchronous iterator over the fetched data. - :raises ValueError: If any required credentials or resource parameters are missing. - :raises GraphAPIError: If the API request fails. + Example: + for record in provider.iter_records(): + print(record) """ async_gen = self.fetch_data() return AsyncToSyncIterator(async_gen) def get_resource_schema(self) -> Dict[str, Any]: """ - Retrieves the schema of a Microsoft Graph API resource by fetching a single record. + Retrieves the schema of a Microsoft Graph API resource by sampling a record. + + Returns: + Tuple[Dict[str, Any], StructType]: A tuple containing the sample record + and its corresponding PySpark schema. + + Raises: + ValueError: If no records are found or required options are missing. 
+ Exception: If the API request fails. - :param options: Connector options containing authentication credentials and resource details. - :return: A dictionary representing the schema of the resource. - :raises ValueError: If no records are found or if required options are missing. - :raises GraphAPIError: If the API request fails. + Example: + record, schema = provider.get_resource_schema() """ async_gen = self.fetch_data() @@ -88,10 +144,17 @@ def get_resource_schema(self) -> Dict[str, Any]: record = to_json(record) schema = to_pyspark_schema(get_python_schema(record)) return record, schema - + except StopIteration: raise ValueError(f"No records available for {self.resource.resource_name}") - + @abstractmethod def resource(self) -> BaseResource: - ... \ No newline at end of file + """ + Abstract property that must be implemented to provide the resource + configuration. + + Returns: + BaseResource: The resource definition to use for fetching data. + """ + ... diff --git a/src/pyspark_msgraph_source/core/models.py b/src/pyspark_msgraph_source/core/models.py index 7651816..7ee6357 100644 --- a/src/pyspark_msgraph_source/core/models.py +++ b/src/pyspark_msgraph_source/core/models.py @@ -8,10 +8,24 @@ from urllib.parse import unquote from kiota_abstractions.base_request_builder import BaseRequestBuilder + @dataclass class BaseResource: - name: str # User friendly name for Spark reader - resource_name: str # Microsoft Graph leaf resource name + """ + Represents a resource from Microsoft Graph API, such as list_items, users, etc. + + Attributes: + name (str): User-friendly name for the Spark reader. + resource_name (str): Microsoft Graph leaf resource name (e.g., users, items). + request_builder_module (str): Module path of the request builder class from the MSGraph Python SDK. + query_params (Dict[str, Any], optional): Extracted query parameters from the URL template. + resource_params (Dict[str, Any], optional): Extracted path parameters from the URL template. + request_builder_cls_name (str, optional): PascalCase name of the request builder class. + request_builder_query_cls_name (str, optional): PascalCase name of the request builder's query parameters class. + """ + + name: str + resource_name: str request_builder_module: str query_params: Dict[str, Any] = None resource_params: Dict[str, Any] = None @@ -19,27 +33,42 @@ class BaseResource: request_builder_query_cls_name: str = None def __post_init__(self): + """ + Initializes derived attributes and parses the URL template. + + Raises: + ValueError: If the 'name' attribute is not provided. + """ if not self.name: raise ValueError("name is required") - + self.request_builder_cls_name = self._pascal_case(f"{self.resource_name}_request_builder") - #self.request_builder_cls = self.get_request_builder_cls() self.request_builder_query_cls_name = self._pascal_case(f"{self.resource_name}_request_builder_get_query_parameters") - #self.query_parameters_cls = self.get_query_parameters_cls() self.parse_url_template() - @classmethod def _pascal_case(cls, snake_str: str) -> str: """ - Converts snake_case to PascalCase. - Example: "items_request_builder" -> "ItemsRequestBuilder" + Converts a snake_case string to PascalCase. + + Args: + snake_str (str): The snake_case string to convert. + + Returns: + str: PascalCase formatted string. """ return "".join(word.title() for word in snake_str.split("_")) - + def get_query_parameters_cls(self): """ Retrieves the query parameters class from the request builder module. 
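The lookup described here reduces to `importlib` plus attribute inspection. A standalone sketch of the same technique, using the `list_items` module path that appears later in this patch series (the module and class names follow the msgraph-sdk-python generated layout and should be treated as assumptions):

```python
import importlib

# MSGRAPH_SDK_PACKAGE + request_builder_module, as combined in BaseResource
module = importlib.import_module(
    "msgraph.generated.sites.item.lists.item.items.items_request_builder"
)

# PascalCase names derived the same way _pascal_case() derives them
builder_cls = getattr(module, "ItemsRequestBuilder")
query_cls = getattr(builder_cls, "ItemsRequestBuilderGetQueryParameters")

# A dataclass of $-style query parameters, all defaulting to None
print(query_cls())
```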
+ + Returns: + Any: Query parameters class object. + + Raises: + ImportError: If the request builder module is not found. + AttributeError: If the required class is not found. """ try: module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") @@ -48,19 +77,24 @@ def get_query_parameters_cls(self): if not request_builder_cls or not issubclass(request_builder_cls, BaseRequestBuilder): raise AttributeError(f"{self.request_builder_cls_name} not found in {module.__name__}") - # Inspect the attributes to find the query parameters class - for attr in dir(request_builder_cls): if attr == self.request_builder_query_cls_name: return getattr(request_builder_cls, attr) - raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") - + raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") + except ModuleNotFoundError: raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") def get_request_builder_cls(self) -> BaseRequestBuilder: """ - Dynamically imports a module and finds the RequestBuilder class. + Dynamically imports a module and retrieves the request builder class. + + Returns: + BaseRequestBuilder: The request builder class. + + Raises: + ImportError: If the module is not found. + AttributeError: If the class is not valid. """ try: module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") @@ -72,85 +106,89 @@ def get_request_builder_cls(self) -> BaseRequestBuilder: return cls except ImportError: raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") - + def get_request_builder_url_template(self): """ - Extracts the `url_template` by analyzing the source code of the class. + Extracts the URL template from the request builder class's __init__ method. + + Returns: + str: URL template string. + + Raises: + TypeError: If the URL template cannot be extracted. """ try: cls = self.get_request_builder_cls() if inspect.isclass(cls) and hasattr(cls, "__init__"): - # Extract the __init__ function source code init_source = inspect.getsource(cls.__init__) if "super().__init__(" in init_source: - lines = init_source.split("\n") - for line in lines: + for line in init_source.split("\n"): if "super().__init__(" in line: match = re.search(r'super\(\).__init__\s*\([^,]+,\s*"([^"]+)"', line) if match: - url_template = match.group(1).replace('"', "").replace("'", "") - return url_template - + return match.group(1).replace('"', "").replace("'", "") except TypeError: raise TypeError(f"Error extracting URL template from {cls.__name__}") def parse_url_template(self): """ - Parses the `url_template` string to extract path parameters and query parameters. + Parses the URL template to extract path and query parameters. + + Raises: + ValueError: If the URL template is not found. 
""" url_template = self.get_request_builder_url_template() if not url_template: raise ValueError("URL template not found in request builder class") - # Extract path parameters (decode %2Did → _id) path_parameters = [ unquote(match.group(1)).replace("%2D", "_") for match in re.finditer(r"\{([^?}]+)\}", url_template) if match.group(1).lower() != "+baseurl" ] - # Extract query parameters (decode %24expand → $expand) query_match = re.search(r"\{\?([^}]+)\}", url_template) query_parameters = ( [unquote(q).replace("%24", "$") for q in query_match.group(1).split(",")] if query_match else [] ) - self.resource_params = {k:None for k in path_parameters} + self.resource_params = {k: None for k in path_parameters} self.query_params = {qp.strip().replace("$", ""): None for qp in query_parameters} - def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': """ - Maps the provided options to either query parameters or resource parameters. + Maps provided options to valid query and resource parameters. + + Args: + options (Dict[str, Any]): User-provided options. - :param options: Dictionary of options provided by the user. - :param query_params: List of valid query parameter names. - :param resource_params: List of valid resource parameter names. - :return: A tuple (mapped_query_params, mapped_resource_params, invalid_params) + Returns: + BaseResource: Updated instance with mapped parameters. + + Raises: + ValueError: If required resource parameters are missing or extra parameters are provided. """ missing_params = [param for param in self.resource_params if param not in options] if missing_params: raise ValueError(f"Missing required resource parameters: {', '.join(missing_params)}") - # TODO: add max $top value validation. if int(options.get("top", 1)) <= 100: logging.warning("Setting a low `top` value in Microsoft Graph queries can cause high latency and increase throttling risk.") - mapped_query_params = {"%24"+k: v for k, v in options.items() if k in self.query_params} + mapped_query_params = {"%24" + k: v for k, v in options.items() if k in self.query_params} mapped_resource_params = {k.replace("-", "%2D"): v for k, v in options.items() if k in self.resource_params} - + invalid_params = {k: v for k, v in options.items() if k not in self.query_params and k not in self.resource_params} - - if len(invalid_params) > 0: + + if invalid_params: raise ValueError(f"Extra parameters {invalid_params} not allowed.") - + self.query_params = mapped_query_params self.resource_params = mapped_resource_params - + return self - GUID_PATTERN = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") @@ -158,20 +196,26 @@ def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': @dataclass class ConnectorOptions: - """Options for Microsoft Graph API requests with strict resource_path validation.""" + """ + Options for Microsoft Graph API requests with strict credential validation. + + Attributes: + tenant_id (str): Azure tenant ID (GUID). + client_id (str): Azure client ID (GUID). + client_secret (str): Azure client secret. + """ tenant_id: str client_id: str client_secret: str + def __post_init__(self): ... 
- + def _validate_credentials(self): - """Validates the format and presence of credentials.""" - if not self.tenant_id or not GUID_PATTERN.match(self.tenant_id): - raise ValueError("Invalid tenant_id: must be a valid GUID.") - - if not self.client_id or not GUID_PATTERN.match(self.client_id): - raise ValueError("Invalid client_id: must be a valid GUID.") - - if not self.client_secret or not isinstance(self.client_secret, str): - raise ValueError("Invalid client_secret: must be a non-empty string.") \ No newline at end of file + """ + Validates the format and presence of credentials. + + Raises: + ValueError: If any credential is invalid or missing. + """ + ... diff --git a/src/pyspark_msgraph_source/core/resource_provider.py b/src/pyspark_msgraph_source/core/resource_provider.py index bc72ac6..468e5d1 100644 --- a/src/pyspark_msgraph_source/core/resource_provider.py +++ b/src/pyspark_msgraph_source/core/resource_provider.py @@ -5,19 +5,34 @@ from typing import Dict, Type from pyspark_msgraph_source.core.base_client import BaseResourceProvider + # @lru_cache(maxsize=10) def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: """ - Dynamically loads all resource providers from the resources package + Dynamically loads all resource providers from the `resources` package. + + This function scans the `resources` subpackage of the current root package, + discovers all modules (excluding `base.py`), and imports any classes ending + with `ResourceProvider` that are subclasses of `BaseResourceProvider`. + + This allows dynamic discovery and registration of new resource providers + without requiring explicit imports. + + Returns: + Dict[str, Type[BaseResourceProvider]]: A dictionary mapping resource + names (module names) to their corresponding resource provider classes. + + Example: + providers = load_resource_providers() + print(providers.keys()) """ providers = {} root_package = __package__.split('.')[0] logging.debug(f"Current root package {root_package}.") - + package = f'{root_package}.resources' - resources_pkg = importlib.import_module(package) - + for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__): if name != 'base': # Skip the base module try: @@ -25,27 +40,49 @@ def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: for attr_name in dir(module): if attr_name.endswith('ResourceProvider'): provider_class = getattr(module, attr_name) - if (isinstance(provider_class, type) and - issubclass(provider_class, BaseResourceProvider) and + if (isinstance(provider_class, type) and + issubclass(provider_class, BaseResourceProvider) and provider_class != BaseResourceProvider): providers[name] = provider_class except ImportError as e: print(f"Warning: Could not load resource provider {name}: {e}") - - return frozenset(providers.items()) + + return providers + # @lru_cache(maxsize=10) def get_resource_provider(resource_name: str, options: frozenset) -> BaseResourceProvider: """ - Factory method to get the appropriate resource provider + Factory method to retrieve the appropriate resource provider based on its name. + + This function looks up the resource provider class registered in + `load_resource_providers()`, instantiates it with the provided options, + and returns the instance. + + Args: + resource_name (str): The name of the resource (typically the module name). + options (frozenset): A frozenset of key-value pairs representing the + configuration options for the provider. 
+ + Returns: + BaseResourceProvider: An instance of the corresponding resource provider. + + Raises: + ValueError: If the requested resource name is not found in the + available providers. + + Example: + provider = get_resource_provider('users', frozenset({'tenant_id': 'xxx'}.items())) + for record in provider.iter_records(): + print(record) """ providers = dict(load_resource_providers()) provider_class: BaseResourceProvider = providers.get(resource_name) - + if not provider_class: available = ', '.join(providers.keys()) raise ValueError( f"Unsupported resource name: '{resource_name}'. " f"Available resources: {available}" ) - return provider_class(dict(options)) \ No newline at end of file + return provider_class(dict(options)) diff --git a/src/pyspark_msgraph_source/core/source.py b/src/pyspark_msgraph_source/core/source.py index efa4660..c5e85a6 100644 --- a/src/pyspark_msgraph_source/core/source.py +++ b/src/pyspark_msgraph_source/core/source.py @@ -1,52 +1,127 @@ import logging -from typing import Any, Dict, Union +from typing import Any, Dict, Iterator, Tuple, Union from pyspark.sql.datasource import DataSource, DataSourceReader from pyspark.sql.types import StructType from pyspark_msgraph_source.core.base_client import BaseResourceProvider - from pyspark_msgraph_source.core.resource_provider import get_resource_provider -# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources +# Reference: https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources logger = logging.getLogger(__name__) + class MSGraphDataSource(DataSource): """ + A custom PySpark DataSource implementation to read data from Microsoft Graph API. + + This datasource uses dynamic resource providers to connect to different + Microsoft Graph resources based on the `resource` option. + + If schema inference is required, it fetches sample data to infer the schema. + + See Also: + Databricks PySpark DataSource API: + https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources + + Args: + options (Dict[str, Any]): Connector options, including the required + `resource` name and authentication parameters. + + Raises: + ValueError: If the `resource` option is missing. + Example: + df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "") \ + .option("list-id", "") \ + .option("top", 999) \ + .option("expand", "fields") \ + .load() + + df.show() """ + def __init__(self, options: Dict[str, Any]): - - self.resource_name = options.pop("resource") + self.resource_name = options.pop("resource", None) if not self.resource_name: raise ValueError("resource is missing, please provide a valid resource name.") self.options = frozenset(options.items()) - + @classmethod - def name(cls): + def name(cls) -> str: + """ + Returns the registered name of the DataSource. + + Returns: + str: The name of the DataSource, "msgraph". + """ return "msgraph" - + def schema(self): - logger.info("Schema not provided, infering from the source.") - resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + """ + Infers the schema of the Microsoft Graph resource. + + This will call the corresponding resource provider to fetch a sample + record and determine its schema. + + Returns: + StructType: The inferred schema of the resource. 
+ """ + logger.info("Schema not provided, inferring from the source.") + resource_provider: BaseResourceProvider = get_resource_provider(self.resource_name, self.options) _, schema = resource_provider.get_resource_schema() - logger.debug(f"Infered schema : {schema}") + logger.debug(f"Inferred schema: {schema}") return schema - def reader(self, schema: StructType): + def reader(self, schema: StructType) -> "MSGraphDataSourceReader": + """ + Provides the DataSourceReader to read data. + + Args: + schema (StructType): The schema to apply to the records. + + Returns: + MSGraphDataSourceReader: The configured reader for this resource. + """ return MSGraphDataSourceReader(self.resource_name, self.options, schema) class MSGraphDataSourceReader(DataSourceReader): + """ + A DataSourceReader to fetch records from a Microsoft Graph resource. + + This reader uses the resource provider to iterate over records and + yields rows compatible with the provided schema. + + Args: + resource_name (str): The name of the Microsoft Graph resource. + options (frozenset): Connector options. + schema (Union[StructType, str]): The schema to apply to the records. + """ - def __init__(self, resource_name :str, options: frozenset, schema: Union[StructType, str]): + def __init__(self, resource_name: str, options: frozenset, schema: Union[StructType, str]): self.schema: StructType = schema self.options = options self.resource_name = resource_name - - def read(self, partition): + + def read(self, partition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]: # type: ignore + """ + Reads records from the Microsoft Graph API. + + For each record fetched from the resource provider, it transforms + the record into a PySpark Row object matching the schema. + + Args: + partition: Unused in this implementation (for future partitioning support). + + Yields: + Row: A PySpark Row object for each record. 
+ """ from pyspark_msgraph_source.core.utils import to_json from pyspark.sql import Row - resource_provider:BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + + resource_provider: BaseResourceProvider = get_resource_provider(self.resource_name, self.options) for row in resource_provider.iter_records(): row = to_json(row) row_data = {f.name: row.get(f.name, None) for f in self.schema.fields} diff --git a/src/pyspark_msgraph_source/core/utils.py b/src/pyspark_msgraph_source/core/utils.py index b878c2a..7f9d658 100644 --- a/src/pyspark_msgraph_source/core/utils.py +++ b/src/pyspark_msgraph_source/core/utils.py @@ -1,10 +1,10 @@ -from typing import Any +from typing import Any, Dict, List, Union from kiota_serialization_json.json_serialization_writer_factory import JsonSerializationWriterFactory import json from pyspark.sql.types import ( StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, - MapType, ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType + ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType, DataType ) from datetime import datetime, date @@ -12,26 +12,37 @@ # Convert to JSON using Kiota writer_factory = JsonSerializationWriterFactory() -writer = writer_factory.get_serialization_writer("application/json") -def to_json(value): - value.serialize(writer) - # Get JSON string - return json.loads((writer.get_serialized_content().decode("utf-8"))) -def to_jsonValue(value): - value.serialize(writer) - # Get JSON string - return str(json.loads((writer.get_serialized_content().decode("utf-8")))) +def to_json(value: Any) -> Dict[str, Any]: + """ + Serializes a Kiota serializable object to a JSON-compatible dictionary. + Args: + value (Any): An object that implements the Kiota serialization interface. + + Returns: + dict: A dictionary representing the serialized JSON content. + """ + writer = writer_factory.get_serialization_writer("application/json") + value.serialize(writer) + return json.loads(writer.get_serialized_content().decode("utf-8")) -def get_python_schema(obj:Any): +def get_python_schema( + obj: Any +) -> Union[str, Dict[str, Any], List[Any]]: """ Recursively extracts the schema from a Python object. - :param obj: The Python object (dict, list, int, str, etc.). - :return: A schema dictionary representing field types. + Args: + obj (Any): The Python object (e.g., dict, list, int, str) to analyze. + + Returns: + Union[str, dict, list]: A nested schema representing the object's structure and field types. + - For dicts: a dict with key-value schemas. + - For lists: a list with the schema of the first element or "any" if empty. + - For primitives: a string indicating the type ("str", "int", etc.). """ if isinstance(obj, bool): return "bool" @@ -57,14 +68,35 @@ def get_python_schema(obj:Any): return "null" return "unknown" # Fallback for unrecognized types -def to_pyspark_schema(schema_dict): + +def to_pyspark_schema( + schema_dict: Dict[str, Any] +) -> StructType: """ Recursively converts a nested Python schema dictionary to a PySpark StructType schema. - :param schema_dict: Dictionary with field names as keys and data types as values. - :return: PySpark StructType schema. + Args: + schema_dict (dict): A dictionary with field names as keys and data types as values, + where types are represented as strings (e.g., "str", "int", "bool"). + Nested dictionaries represent nested StructTypes. + + Returns: + StructType: A PySpark StructType schema reflecting the provided structure. 
+ + Example: + Input: + {"name": "str", "age": "int", "scores": ["float"], "address": {"city": "str"}} + Output: + StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("scores", ArrayType(DoubleType()), True), + StructField("address", StructType([ + StructField("city", StringType(), True) + ]), True) + ]) """ - type_mapping = { + type_mapping: Dict[str, DataType] = { "str": StringType(), "int": IntegerType(), "float": DoubleType(), @@ -74,11 +106,20 @@ def to_pyspark_schema(schema_dict): "long": LongType(), "binary": BinaryType(), "decimal": DecimalType(38, 18), - "unknown": StringType() + "null": StringType(), + "unknown": StringType(), } - def convert_type(value): - """Recursively converts types, handling nested dicts and lists.""" + def convert_type(value: Any) -> DataType: + """ + Recursively converts type descriptors to PySpark data types. + + Args: + value (Any): The type descriptor (str, dict, list). + + Returns: + DataType: The corresponding PySpark data type. + """ if isinstance(value, dict): # Nested structure return StructType([StructField(k, convert_type(v), True) for k, v in value.items()]) elif isinstance(value, list): # List of elements (assume first element type) @@ -87,5 +128,7 @@ def convert_type(value): return ArrayType(convert_type(value[0])) return type_mapping.get(value, StringType()) # Default to StringType - struct_fields = [StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items()] - return StructType(struct_fields) \ No newline at end of file + struct_fields: List[StructField] = [ + StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items() + ] + return StructType(struct_fields) From ce161bcdf312db67c672456e72b6a89bbbe712e2 Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 15:06:38 +0000 Subject: [PATCH 08/10] add documentation --- README.md | 198 +++++++----------- docs/api/core/async-iterator.md | 2 +- docs/api/core/client.md | 2 +- docs/api/core/models.md | 2 +- docs/api/core/resource-provider.md | 2 +- docs/api/core/source.md | 3 + docs/api/core/utils.md | 2 +- docs/api/resources/index.md | 33 +++ docs/api/resources/list-items.md | 4 + docs/getting-started.md | 0 docs/guides/list-items.md | 94 +++++++++ docs/index.md | 88 ++++++-- mkdocs.yml | 5 + .../resources/list_items.py | 50 ++++- 14 files changed, 342 insertions(+), 143 deletions(-) create mode 100644 docs/api/core/source.md create mode 100644 docs/api/resources/index.md create mode 100644 docs/api/resources/list-items.md create mode 100644 docs/getting-started.md create mode 100644 docs/guides/list-items.md diff --git a/README.md b/README.md index 4ea60d4..ebd5e3e 100644 --- a/README.md +++ b/README.md @@ -1,163 +1,123 @@ -# Apache PySpark Custom Data Source Template -This repository provides a template for creating a custom data source for Apache PySpark. It is designed to help developers extend PySpark’s data source API to support custom data ingestion and storage mechanisms. +# pyspark-msgraph-source +A **PySpark DataSource** to seamlessly integrate and read data from **Microsoft Graph API**, enabling easy access to resources like **SharePoint List Items**, and more. -## Motivation - -When developing custom PySpark data sources, I encountered several challenges that made the development process frustrating: - -1. 
**Environment Setup Complexity**: Setting up a development environment for PySpark data source development was unnecessarily complex, with multiple dependencies and version conflicts. - -2. **Test Data Management**: Managing test data and maintaining consistent test environments across different machines was challenging. - -3. **Debugging Issues**: The default setup made it difficult to debug custom data source code effectively, especially when dealing with Spark's distributed nature. - -4. **Documentation Gaps**: Existing documentation for custom data source development was scattered and often incomplete. - -This template repository aims to solve these pain points and provide a streamlined development experience. - +--- ## Features +- Entra ID Authentication +Securely authenticate with Microsoft Graph using DefaultAzureCredential, supporting local development and production seamlessly. -- Pre-configured development environment -- Ready-to-use test infrastructure -- Example implementation -- Automated tests setup -- Debug-friendly configuration - -## Getting Started - -Follow these steps to set up and use this repository: +- Automatic Pagination Handling +Fetches all paginated data from Microsoft Graph without manual intervention. -### Prerequisites +- Dynamic Schema Inference +Automatically detects the schema of the resource by sampling data, so you don't need to define it manually. -- Docker -- Visual Studio Code -- Python 3.11 +- Simple Configuration with .option() +Easily configure resources and query parameters directly in your Spark read options, making it flexible and intuitive. -### Creating a Repository from This Template +- Zero External Ingestion Services +No additional services like Azure Data Factory or Logic Apps are needed—directly ingest data into Spark from Microsoft Graph. -To create a new repository based on this template: +- Extensible Resource Providers +Add custom resource providers to support more Microsoft Graph endpoints as needed. -1. Go to the [GitHub repository](https://github.com/geekwhocodes/pyspark-custom-datasource-template). -2. Click the **Use this template** button. -3. Select **Create a new repository**. -4. Choose a repository name, visibility (public or private), and click **Create repository from template**. -5. Clone your new repository: +- Pluggable Architecture +Dynamically load resource providers without modifying core logic. - ```sh - git clone https://github.com/your-username/your-new-repository.git - cd your-new-repository - ``` +- Optimized for PySpark +Designed to work natively with Spark's DataFrame API for big data processing. -### Setup +- Secure by Design +Credentials and secrets are handled using Azure Identity best practices, avoiding hardcoding sensitive data. -1. **Open the repository in Visual Studio Code:** +--- - ```sh - code . - ``` +## Installation -2. **Build and start the development container:** - - Open the command palette (Ctrl+Shift+P) and select `Remote-Containers: Reopen in Container`. +```bash +pip install pyspark-msgraph-source +``` -3. **Initialize the environment:** +--- - The environment will be initialized automatically by running the `init-env.sh` script defined in the `devcontainer.json` file. +## ⚡ Quickstart -### Project Structure +### 1. Authentication -The project follows this structure: +This package uses [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential). +Ensure you're authenticated: +```bash +az login ``` -. 
-├── src/ -│ ├── fake_source/ # Default fake data source implementation -│ │ ├── __init__.py -│ │ ├── source.py # Implementation of the fake data source -│ │ ├── schema.py # Schema definitions (if applicable) -│ │ └── utils.py # Helper functions (if needed) -│ ├── tests/ # Unit tests for the custom data source -│ │ ├── __init__.py -│ │ ├── test_source.py # Tests for the data source -│ │ └── conftest.py # Test configuration and fixtures -├── .devcontainer/ # Development container setup files -│ ├── Dockerfile -│ ├── devcontainer.json -├── |── scripts -├── | ├── init-env.sh # Initialization script for setting up the environment -├── pyproject.toml # Project dependencies and build system configuration -├── README.md # Project documentation -├── LICENSE # License file -``` - -### Usage - -By default, this template includes a **fake data source** that generates mock data. You can use it as-is or replace it with your own implementation. -1. **Register the custom data source:** - - ```python - from pyspark.sql import SparkSession - from fake_source.source import FakeDataSource - - spark = SparkSession.builder.getOrCreate() - spark.dataSource.register(FakeDataSource) - ``` +Or set environment variables: +```bash +export AZURE_CLIENT_ID= +export AZURE_TENANT_ID= +export AZURE_CLIENT_SECRET= +``` -2. **Read data using the custom data source:** +### 2. Example Usage - ```python - df = spark.read.format("fake").load() - df.show() - ``` +```python +from pyspark.sql import SparkSession -3. **Run tests:** +spark = SparkSession.builder \ +.appName("MSGraphExample") \ +.getOrCreate() - ```sh - pytest - ``` +from pyspark_msgraph_source.core.source import MSGraphDataSource +spark.dataSource.register(MSGraphDataSource) -### Customization +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.load() -To replace the fake data source with your own: +df.show() +``` -1. **Rename the package folder:** +--- - ```sh - mv src/fake_source src/your_datasource_name - ``` +## Supported Resources -2. **Update imports in `source.py` and other files:** +| Resource | Description | +|--------------|-----------------------------| +| `list_items`| SharePoint List Items | +| *(more coming soon...)* | | - ```python - from your_datasource_name.source import CustomDataSource - ``` +--- -3. **Update `pyproject.toml` to reflect the new package name.** +## Development -4. **Modify the schema and options in `source.py` to fit your use case.** +Coming soon... -### References -1. [Microsoft Learn - PySpark custom data sources](https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources) +--- -### License +## Troubleshooting -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. +| Issue | Solution | +|---------------------------------|----------------------------------------------| +| `ValueError: resource missing` | Add `.option("resource", "list_items")` | +| Empty dataframe | Verify IDs, permissions, and access | +| Authentication failures | Check Azure credentials and login status | -### Contact +--- -For issues and questions, please use the GitHub Issues section. +## 📄 License +[MIT License](LICENSE) -### Need Help Setting Up a Data Intelligence Platform with Databricks? -If you need expert guidance on setting up a modern data intelligence platform using Databricks, we can help. 
Our consultancy specializes in: +--- -- Custom data source development for Databricks and Apache Spark -- Optimizing ETL pipelines for performance and scalability -- Data governance and security using Unity Catalog -- Building ML & AI solutions on Databricks +## 📚 Resources -🚀 [Contact us](https://www.linkedin.com/in/geekwhocodes/) for a consultation and take your data platform to the next level. +- [Microsoft Graph API](https://learn.microsoft.com/en-us/graph/overview) +- [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential) diff --git a/docs/api/core/async-iterator.md b/docs/api/core/async-iterator.md index d280bce..986e6d1 100644 --- a/docs/api/core/async-iterator.md +++ b/docs/api/core/async-iterator.md @@ -1,3 +1,3 @@ -# Core Engine +# Async To Sync Iterator ::: pyspark_msgraph_source.core.async_iterator diff --git a/docs/api/core/client.md b/docs/api/core/client.md index 1b406a9..f8a05a2 100644 --- a/docs/api/core/client.md +++ b/docs/api/core/client.md @@ -1,3 +1,3 @@ -# Core Engine +# Base Client ::: pyspark_msgraph_source.core.base_client diff --git a/docs/api/core/models.md b/docs/api/core/models.md index 396ba36..2d8f907 100644 --- a/docs/api/core/models.md +++ b/docs/api/core/models.md @@ -1,3 +1,3 @@ -# Core Engine +# Core Models ::: pyspark_msgraph_source.core.models diff --git a/docs/api/core/resource-provider.md b/docs/api/core/resource-provider.md index ba837e5..a063ce1 100644 --- a/docs/api/core/resource-provider.md +++ b/docs/api/core/resource-provider.md @@ -1,3 +1,3 @@ -# Core Engine +# Resource Provider ::: pyspark_msgraph_source.core.resource_provider diff --git a/docs/api/core/source.md b/docs/api/core/source.md new file mode 100644 index 0000000..3b05041 --- /dev/null +++ b/docs/api/core/source.md @@ -0,0 +1,3 @@ +# Source + +::: pyspark_msgraph_source.core.source diff --git a/docs/api/core/utils.md b/docs/api/core/utils.md index 231bd80..a51b054 100644 --- a/docs/api/core/utils.md +++ b/docs/api/core/utils.md @@ -1,3 +1,3 @@ -# Core Engine +# Utils ::: pyspark_msgraph_source.core.utils diff --git a/docs/api/resources/index.md b/docs/api/resources/index.md new file mode 100644 index 0000000..ef87d05 --- /dev/null +++ b/docs/api/resources/index.md @@ -0,0 +1,33 @@ + +# Available Resources + +This page lists the Microsoft Graph resources currently supported by the `pyspark-msgraph-source` connector. + +--- + +## Supported Resources + +| Resource Name | Description | Read more | +|---------------|-------------|------------------| +| `list_items` | Retrieves items from a SharePoint List | [Configuration](list-items.md) | + +--- + +## Adding New Resources + +Want to add support for more resources? +Check out the [Contributing Guide](contributing.md) to learn how to extend the connector! + +--- + +## Notes +- Resources may require specific Microsoft Graph API permissions. +- Pagination, authentication, and schema inference are handled automatically. + +--- + +## Request New Resources + +Is your desired resource not listed here? +Open an [issue](https://github.com/geekwhocodes/pyspark-msgraph-source/issues) to request it!
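Thanks to the registry pattern described above, a new resource is usually just one file in the `resources` package. A hypothetical sketch mirroring the `list_items` provider shown later in this series (the `users` request-builder module path is an assumption about the msgraph SDK layout, not a shipped resource):

```python
from functools import cached_property
from typing import Dict

from pyspark_msgraph_source.core.base_client import BaseResourceProvider
from pyspark_msgraph_source.core.models import BaseResource


class UsersResourceProvider(BaseResourceProvider):
    """Hypothetical provider for the Microsoft Graph /users collection."""

    def __init__(self, options: Dict[str, str]):
        self.options = options
        super().__init__(options)

    @cached_property
    def resource(self) -> BaseResource:
        # request_builder_module is assumed from the SDK's generated layout
        return BaseResource(
            name="users",
            resource_name="users",
            request_builder_module="users.users_request_builder",
        ).map_options_to_params(self.options)
```

Because `load_resource_providers()` scans the `resources` package for classes whose names end in `ResourceProvider`, dropping a file like this into the package is enough to register it.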
+ diff --git a/docs/api/resources/list-items.md b/docs/api/resources/list-items.md new file mode 100644 index 0000000..78b6c83 --- /dev/null +++ b/docs/api/resources/list-items.md @@ -0,0 +1,4 @@ +# Resource - List Items + + +::: pyspark_msgraph_source.resources.list_items diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/guides/list-items.md b/docs/guides/list-items.md new file mode 100644 index 0000000..7b8a23b --- /dev/null +++ b/docs/guides/list-items.md @@ -0,0 +1,94 @@ +# Reading SharePoint List Items with PySpark + +This guide explains how to read **List Items** from a **SharePoint List** using the `pyspark-msgraph-source` connector and Microsoft Graph API. + +--- + +## Prerequisites +- Microsoft Entra (Azure AD) authentication set up with permissions to access SharePoint lists. +- Required Microsoft Graph API permissions: + - `Sites.Read.All` + - `Lists.Read` +- Installed `pyspark-msgraph-source` package. +- Initialized Spark session. + +--- + +## 🔹 Supported Options for `list_items` + +| Option | Description | Required | +|--------------|-----------------------------------------------------------|----------| +| `resource` | Resource name (must be `"list_items"`) | ✅ Yes | +| `site-id` | The ID of the SharePoint site | ✅ Yes | +| `list-id` | The ID of the list within the SharePoint site | ✅ Yes | +| `top` | (Optional) Number of records to fetch | ❌ No | +| `expand` | (Optional) Related entities to expand (e.g., `"fields"`) | ❌ No | + +> **Note:** You can find `site-id` and `list-id` via Graph API explorer or SharePoint admin tools. + +--- + +## Example Usage + +```python +from pyspark_msgraph_source.core.source import MSGraphDataSource + +# Register the data source (typically required once) +spark.dataSource.register(MSGraphDataSource) + +# Read data from Microsoft Graph +df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "37d7dde8-0b6b-4b7c-a2fd-2e217f54a263") \ + .option("list-id", "5ecf26db-0161-4069-b763-856217415099") \ + .option("top", 111) \ + .option("expand", "fields") \ + .load() + +# Show the results +df.show() +``` + +--- + +## Explanation of Example +- **`spark.read.format("msgraph")`**: Use the Microsoft Graph connector. +- **`.option("resource", "list_items")`**: Specify the resource to fetch SharePoint list items. +- **`.option("site-id", "...")` and `.option("list-id", "...")`**: Provide the SharePoint site and list IDs. +- **`.option("top", 111)`**: Limit the number of records (optional). +- **`.option("expand", "fields")`**: Retrieve additional field details (optional). +- **`.load()`**: Execute the read operation. + +--- + +## Schema Inference +The connector automatically infers the schema by fetching a sample record from the API if you do not provide a schema. + +--- + +## Error Handling +- Missing or invalid `site-id` or `list-id` will raise a `ValueError`. +- API permission errors will raise authentication exceptions. +- Network or Microsoft Graph issues will raise clear, descriptive exceptions. + +--- + +## Notes +- Authentication is handled automatically via [**`DefaultAzureCredential`**](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential), supporting: + - Environment credentials + - Managed Identity + - Azure CLI login + - Visual Studio Code authentication + +- Use `.option("top", N)` to control the number of records retrieved for large datasets. 
+- To retrieve custom fields, include `.option("expand", "fields")`. + +--- + +## Troubleshooting + +| Issue | Solution | +|-----------------------------------------|-------------------------------------------------| +| `"resource is missing"` error | Ensure `.option("resource", "list_items")` | +| Empty dataframe | Check permissions and ensure valid IDs | +| `"Unsupported resource name"` error | Verify `"list_items"` is supported | \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 000ea34..5752edb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,17 +1,81 @@ -# Welcome to MkDocs -For full documentation visit [mkdocs.org](https://www.mkdocs.org). +# Welcome to **PySpark Microsoft Graph Connector** -## Commands +Unlock seamless data access from **Microsoft Graph API** directly into **Apache Spark** using this connector designed for modern data pipelines. -* `mkdocs new [dir-name]` - Create a new project. -* `mkdocs serve` - Start the live-reloading docs server. -* `mkdocs build` - Build the documentation site. -* `mkdocs -h` - Print help message and exit. +--- -## Project layout +## Why Use This Connector? - mkdocs.yml # The configuration file. - docs/ - index.md # The documentation homepage. - ... # Other markdown pages, images and other files. +Working with Microsoft 365 data—such as SharePoint, Teams, Users, and Planner—has traditionally required intermediate services like Azure Data Factory, Logic Apps, or manual exports. With **`pyspark-msgraph-source`**, you can: + +- Authenticate securely with **Entra ID** using `DefaultAzureCredential` +- Query any supported Microsoft Graph resource directly in Spark +- Automatically handle **pagination**, **dynamic schema inference**, and **large datasets** +- Streamline analytics on Microsoft 365 data without extra infrastructure + +--- + +## What is Microsoft Graph? + +[Microsoft Graph](https://learn.microsoft.com/en-us/graph/overview) is the gateway to data and intelligence in Microsoft 365. It provides unified access to: + +- **Users** +- **Groups** +- **Calendars** +- **SharePoint Lists** +- **Teams Channels** +- **Planner Tasks** +- And much more! + +--- + +## What Can You Build? + +- Reporting and analytics on SharePoint Lists +- Business intelligence dashboards with Microsoft Teams activity +- Enterprise insights from Entra ID (Azure AD) +- And much more! + +--- + +## How Does It Work? + +1. Configure your Microsoft Entra (Azure AD) application. +2. Authenticate with `DefaultAzureCredential`. +3. Load data into Spark using `.read.format("msgraph")`. +4. Query, process, and analyze at scale. + +--- + +## Example + +```python +df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "") \ + .option("list-id", "") \ + .load() + +df.show() +``` + +--- + +## Ready to Get Started? + +- Check out the [Getting Started Guide](getting-started.md) +- Explore available [Resources](api/resources) +- Learn how to [Contribute](contributing.md) + +--- + +## Need Help? + +- Open an [issue](https://github.com/geekwhocodes/pyspark-msgraph-source/issues) +- Start a discussion with the community +- Submit feature requests and improvements + +--- + +Welcome aboard and happy querying! 
🚀 diff --git a/mkdocs.yml b/mkdocs.yml index ae842f2..5b48030 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,9 @@ plugins: nav: - Home: index.md + - Guides: + - List Items: guides/list-items.md + - Available Resources: api/resources/index.md - API Reference: - Overview: api/index.md - Core: @@ -22,3 +25,5 @@ nav: - Models: api/core/models.md - Async Iterator: api/core/async-iterator.md - Utils: api/core/utils.md + - Resources: + - List Items: api/resources/list-items.md diff --git a/src/pyspark_msgraph_source/resources/list_items.py b/src/pyspark_msgraph_source/resources/list_items.py index e3d1293..153b2c1 100644 --- a/src/pyspark_msgraph_source/resources/list_items.py +++ b/src/pyspark_msgraph_source/resources/list_items.py @@ -5,19 +5,55 @@ from pyspark_msgraph_source.core.base_client import BaseResourceProvider from pyspark_msgraph_source.core.models import BaseResource +logger = logging.getLogger(__name__) + class ListItemsResourceProvider(BaseResourceProvider): + """ + Resource provider for fetching list items from Microsoft Graph API. + + See Also: + Microsoft Graph listItem API: + https://learn.microsoft.com/en-us/graph/api/listitem-list?view=graph-rest-1.0 + + This provider handles the setup of the `list_items` resource, + configuring the request builder and mapping options to the required parameters. + + Args: + options (Dict[str, str]): Connector options, typically containing + site ID, list ID, and any query parameters. + + Example: + provider = ListItemsResourceProvider(options) + for record in provider.iter_records(): + print(record) + """ def __init__(self, options: Dict[str, str]): + """ + Initializes the ListItemsResourceProvider. + + Args: + options (Dict[str, str]): Connector options required to configure + the resource and authenticate requests. + """ self.options = options super().__init__(options) - - @cached_property + + @cached_property def resource(self) -> BaseResource: - return BaseResource( - name="list_items", - resource_name="items", - request_builder_module="sites.item.lists.item.items.items_request_builder" - ).map_options_to_params(self.options) + """ + Returns the BaseResource configuration for list items. + This sets up the request builder path and resource name + required to make API calls to retrieve list items. + Returns: + BaseResource: Configured resource with mapped options.
+ """ + return BaseResource( + name="list_items", + resource_name="items", + request_builder_module="sites.item.lists.item.items.items_request_builder" + ).map_options_to_params(self.options) From 81591c9cf2dc9a7127bbd188b77fe823e236afef Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 15:14:47 +0000 Subject: [PATCH 09/10] getting started --- README.md | 14 +++++++++ docs/getting-started.md | 61 +++++++++++++++++++++++++++++++++++++++ docs/guides/list-items.md | 2 +- mkdocs.yml | 1 + 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ebd5e3e..a1c65d3 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,20 @@ df = spark.read.format("msgraph") \ .load() df.show() + +# with schema + +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.schema("id string, Title string") +.load() + +df.show() + ``` --- diff --git a/docs/getting-started.md b/docs/getting-started.md index e69de29..8288dc3 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -0,0 +1,61 @@ +## Installation + +```bash +pip install pyspark-msgraph-source +``` + +--- + +### 1. Authentication + +This package uses [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential). +Ensure you're authenticated: + +```bash +az login +``` + +Or set environment variables: +```bash +export AZURE_CLIENT_ID= +export AZURE_TENANT_ID= +export AZURE_CLIENT_SECRET= +``` + +### 2. Example Usage + +```python +from pyspark.sql import SparkSession + +spark = SparkSession.builder \ +.appName("MSGraphExample") \ +.getOrCreate() + +from pyspark_msgraph_source.core.source import MSGraphDataSource +spark.dataSource.register(MSGraphDataSource) + +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.load() + +df.show() + +# with schema + +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.schema("id string, Title string") +.load() + +df.show() + + +``` \ No newline at end of file diff --git a/docs/guides/list-items.md b/docs/guides/list-items.md index 7b8a23b..778bb11 100644 --- a/docs/guides/list-items.md +++ b/docs/guides/list-items.md @@ -14,7 +14,7 @@ This guide explains how to read **List Items** from a **SharePoint List** using --- -## 🔹 Supported Options for `list_items` +## Supported Options for `list_items` | Option | Description | Required | |--------------|-----------------------------------------------------------|----------| diff --git a/mkdocs.yml b/mkdocs.yml index 5b48030..c91a532 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,6 +13,7 @@ plugins: nav: - Home: index.md + - Getting Started: getting-started.md - Guides: - List Items: guides/list-items.md - Available Resource: api/resources/index.md From 8f65ad37b1f7da0d140a5e6ec63e542ec0d50e69 Mon Sep 17 00:00:00 2001 From: geekwhocodes Date: Wed, 5 Mar 2025 15:23:20 +0000 Subject: [PATCH 10/10] test workflow --- .github/workflows/test.yml | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..7a5ceba 
--- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,40 @@ +name: Publish to Test PyPI + +on: + push: + branches: + - 'feature*' + +jobs: + test-and-publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install dependencies + run: poetry install + + - name: Run tests + run: poetry run pytest + + - name: Build the package + run: poetry build + + - name: Publish to Test PyPI + env: + POETRY_PYPI_TOKEN_TESTPYPI: ${{ secrets.TEST_PYPI_TOKEN }} + run: | + poetry config repositories.testpypi https://test.pypi.org/legacy/ + poetry publish -r testpypi --build
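Once the workflow publishes a feature build, a quick smoke test of the Test PyPI artifact can confirm the wheel installs and exposes the expected data source; the index URL is the standard Test PyPI one, and the assertion relies on `MSGraphDataSource.name()` returning "msgraph" as defined earlier in this series:

```python
# First: pip install -i https://test.pypi.org/simple/ pyspark-msgraph-source
from pyspark_msgraph_source.core.source import MSGraphDataSource

# name() is the short format string used by spark.read.format(...)
assert MSGraphDataSource.name() == "msgraph"
print("Test PyPI build imports and exposes the expected data source name.")
```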