diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..7a5ceba --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,40 @@ +name: Publish to Test PyPI + +on: + push: + branches: + - 'feature*' + +jobs: + test-and-publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: Install dependencies + run: poetry install + + - name: Run tests + run: poetry run pytest + + - name: Build the package + run: poetry build + + - name: Publish to Test PyPI + env: + POETRY_PYPI_TOKEN_TESTPYPI: ${{ secrets.TEST_PYPI_TOKEN }} + run: | + poetry config repositories.testpypi https://test.pypi.org/legacy/ + poetry publish -r testpypi --build diff --git a/README.md b/README.md index 4ea60d4..a1c65d3 100644 --- a/README.md +++ b/README.md @@ -1,163 +1,137 @@ -# Apache PySpark Custom Data Source Template -This repository provides a template for creating a custom data source for Apache PySpark. It is designed to help developers extend PySpark’s data source API to support custom data ingestion and storage mechanisms. +# pyspark-msgraph-source +A **PySpark DataSource** to seamlessly integrate and read data from **Microsoft Graph API**, enabling easy access to resources like **SharePoint List Items**, and more. -## Motivation - -When developing custom PySpark data sources, I encountered several challenges that made the development process frustrating: - -1. **Environment Setup Complexity**: Setting up a development environment for PySpark data source development was unnecessarily complex, with multiple dependencies and version conflicts. - -2. **Test Data Management**: Managing test data and maintaining consistent test environments across different machines was challenging. - -3. **Debugging Issues**: The default setup made it difficult to debug custom data source code effectively, especially when dealing with Spark's distributed nature. - -4. **Documentation Gaps**: Existing documentation for custom data source development was scattered and often incomplete. - -This template repository aims to solve these pain points and provide a streamlined development experience. - +--- ## Features +- Entra ID Authentication +Securely authenticate with Microsoft Graph using DefaultAzureCredential, supporting local development and production seamlessly. -- Pre-configured development environment -- Ready-to-use test infrastructure -- Example implementation -- Automated tests setup -- Debug-friendly configuration +- Automatic Pagination Handling +Fetches all paginated data from Microsoft Graph without manual intervention. -## Getting Started +- Dynamic Schema Inference +Automatically detects the schema of the resource by sampling data, so you don't need to define it manually. -Follow these steps to set up and use this repository: +- Simple Configuration with .option() +Easily configure resources and query parameters directly in your Spark read options, making it flexible and intuitive. -### Prerequisites +- Zero External Ingestion Services +No additional services like Azure Data Factory or Logic Apps are needed—directly ingest data into Spark from Microsoft Graph. 
-- Docker -- Visual Studio Code -- Python 3.11 +- Extensible Resource Providers +Add custom resource providers to support more Microsoft Graph endpoints as needed. -### Creating a Repository from This Template +- Pluggable Architecture +Dynamically load resource providers without modifying core logic. -To create a new repository based on this template: +- Optimized for PySpark +Designed to work natively with Spark's DataFrame API for big data processing. -1. Go to the [GitHub repository](https://github.com/geekwhocodes/pyspark-custom-datasource-template). -2. Click the **Use this template** button. -3. Select **Create a new repository**. -4. Choose a repository name, visibility (public or private), and click **Create repository from template**. -5. Clone your new repository: +- Secure by Design +Credentials and secrets are handled using Azure Identity best practices, avoiding hardcoding sensitive data. - ```sh - git clone https://github.com/your-username/your-new-repository.git - cd your-new-repository - ``` +--- -### Setup +## Installation -1. **Open the repository in Visual Studio Code:** - - ```sh - code . - ``` - -2. **Build and start the development container:** - - Open the command palette (Ctrl+Shift+P) and select `Remote-Containers: Reopen in Container`. +```bash +pip install pyspark-msgraph-source +``` -3. **Initialize the environment:** +--- - The environment will be initialized automatically by running the `init-env.sh` script defined in the `devcontainer.json` file. +## ⚡ Quickstart -### Project Structure +### 1. Authentication -The project follows this structure: +This package uses [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential). +Ensure you're authenticated: -``` -. -├── src/ -│ ├── fake_source/ # Default fake data source implementation -│ │ ├── __init__.py -│ │ ├── source.py # Implementation of the fake data source -│ │ ├── schema.py # Schema definitions (if applicable) -│ │ └── utils.py # Helper functions (if needed) -│ ├── tests/ # Unit tests for the custom data source -│ │ ├── __init__.py -│ │ ├── test_source.py # Tests for the data source -│ │ └── conftest.py # Test configuration and fixtures -├── .devcontainer/ # Development container setup files -│ ├── Dockerfile -│ ├── devcontainer.json -├── |── scripts -├── | ├── init-env.sh # Initialization script for setting up the environment -├── pyproject.toml # Project dependencies and build system configuration -├── README.md # Project documentation -├── LICENSE # License file +```bash +az login ``` -### Usage +Or set environment variables: +```bash +export AZURE_CLIENT_ID= +export AZURE_TENANT_ID= +export AZURE_CLIENT_SECRET= +``` -By default, this template includes a **fake data source** that generates mock data. You can use it as-is or replace it with your own implementation. +### 2. Example Usage -1. **Register the custom data source:** +```python +from pyspark.sql import SparkSession - ```python - from pyspark.sql import SparkSession - from fake_source.source import FakeDataSource +spark = SparkSession.builder \ +.appName("MSGraphExample") \ +.getOrCreate() - spark = SparkSession.builder.getOrCreate() - spark.dataSource.register(FakeDataSource) - ``` +from pyspark_msgraph_source.core.source import MSGraphDataSource +spark.dataSource.register(MSGraphDataSource) -2. 
**Read data using the custom data source:** +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.load() - ```python - df = spark.read.format("fake").load() - df.show() - ``` +df.show() -3. **Run tests:** +# with schema - ```sh - pytest - ``` +df = spark.read.format("msgraph") \ +.option("resource", "list_items") \ +.option("site-id", "") \ +.option("list-id", "") \ +.option("top", 100) \ +.option("expand", "fields") \ +.schema("id string, Title string") +.load() -### Customization +df.show() -To replace the fake data source with your own: +``` -1. **Rename the package folder:** +--- - ```sh - mv src/fake_source src/your_datasource_name - ``` +## Supported Resources -2. **Update imports in `source.py` and other files:** +| Resource | Description | +|--------------|-----------------------------| +| `list_items`| SharePoint List Items | +| *(more coming soon...)* | | - ```python - from your_datasource_name.source import CustomDataSource - ``` +--- -3. **Update `pyproject.toml` to reflect the new package name.** +## Development -4. **Modify the schema and options in `source.py` to fit your use case.** +Coming soon... -### References -1. [Microsoft Learn - PySpark custom data sources](https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources) +--- -### License +## Troubleshooting -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. +| Issue | Solution | +|---------------------------------|----------------------------------------------| +| `ValueError: resource missing` | Add `.option("resource", "list_items")` | +| Empty dataframe | Verify IDs, permissions, and access | +| Authentication failures | Check Azure credentials and login status | -### Contact +--- -For issues and questions, please use the GitHub Issues section. +## 📄 License +[MIT License](LICENSE) -### Need Help Setting Up a Data Intelligence Platform with Databricks? -If you need expert guidance on setting up a modern data intelligence platform using Databricks, we can help. Our consultancy specializes in: +--- -- Custom data source development for Databricks and Apache Spark -- Optimizing ETL pipelines for performance and scalability -- Data governance and security using Unity Catalog -- Building ML & AI solutions on Databricks +## 📚 Resources -🚀 [Contact us](https://www.linkedin.com/in/geekwhocodes/) for a consultation and take your data platform to the next level. 
+
+- [Microsoft Graph API](https://learn.microsoft.com/en-us/graph/overview)
+- [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential)
diff --git a/docs/api/core/async-iterator.md b/docs/api/core/async-iterator.md
new file mode 100644
index 0000000..986e6d1
--- /dev/null
+++ b/docs/api/core/async-iterator.md
@@ -0,0 +1,3 @@
+# Async To Sync Iterator
+
+::: pyspark_msgraph_source.core.async_iterator
diff --git a/docs/api/core/client.md b/docs/api/core/client.md
new file mode 100644
index 0000000..f8a05a2
--- /dev/null
+++ b/docs/api/core/client.md
@@ -0,0 +1,3 @@
+# Base Client
+
+::: pyspark_msgraph_source.core.base_client
diff --git a/docs/api/core/models.md b/docs/api/core/models.md
new file mode 100644
index 0000000..2d8f907
--- /dev/null
+++ b/docs/api/core/models.md
@@ -0,0 +1,3 @@
+# Core Models
+
+::: pyspark_msgraph_source.core.models
diff --git a/docs/api/core/resource-provider.md b/docs/api/core/resource-provider.md
new file mode 100644
index 0000000..a063ce1
--- /dev/null
+++ b/docs/api/core/resource-provider.md
@@ -0,0 +1,3 @@
+# Resource Provider
+
+::: pyspark_msgraph_source.core.resource_provider
diff --git a/docs/api/core/source.md b/docs/api/core/source.md
new file mode 100644
index 0000000..3b05041
--- /dev/null
+++ b/docs/api/core/source.md
@@ -0,0 +1,3 @@
+# Source
+
+::: pyspark_msgraph_source.core.source
diff --git a/docs/api/core/utils.md b/docs/api/core/utils.md
new file mode 100644
index 0000000..a51b054
--- /dev/null
+++ b/docs/api/core/utils.md
@@ -0,0 +1,3 @@
+# Utils
+
+::: pyspark_msgraph_source.core.utils
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 0000000..86888cd
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,14 @@
+# API Reference
+
+Welcome to the API reference for `pyspark-msgraph-source`.
+
+## Core
+- [Source](core/source.md)
+- [Base Client](core/client.md)
+- [Resource Provider](core/resource-provider.md)
+- [Models](core/models.md)
+- [Async Iterator](core/async-iterator.md)
+- [Utils](core/utils.md)
+
+## Resources
+- [List Items](resources/list-items.md)
diff --git a/docs/api/resources/index.md b/docs/api/resources/index.md
new file mode 100644
index 0000000..ef87d05
--- /dev/null
+++ b/docs/api/resources/index.md
@@ -0,0 +1,33 @@
+
+# Available Resources
+
+This page lists the Microsoft Graph resources currently supported by the `pyspark-msgraph-source` connector.
+
+---
+
+## Supported Resources
+
+| Resource Name | Description | Read more |
+|---------------|-------------|------------------|
+| `list_items` | Retrieves items from a SharePoint List | [Configuration](list-items.md) |
+
+---
+
+## Adding New Resources
+
+Want to add support for more resources?
+Check out the [Contributing Guide](contributing.md) to learn how to extend the connector!
+
+---
+
+## Notes
+- Resources may require specific Microsoft Graph API permissions.
+- Pagination, authentication, and schema inference are handled automatically.
+
+---
+
+## Request New Resources
+
+Is your desired resource not listed here?
+Open an [issue](https://github.com/geekwhocodes/pyspark-msgraph-source/issues) to request it!
+
diff --git a/docs/api/resources/list-items.md b/docs/api/resources/list-items.md
new file mode 100644
index 0000000..78b6c83
--- /dev/null
+++ b/docs/api/resources/list-items.md
@@ -0,0 +1,4 @@
+# Resource - List Items
+
+
+::: pyspark_msgraph_source.resources.list_items
diff --git a/docs/getting-started.md b/docs/getting-started.md
new file mode 100644
index 0000000..8288dc3
--- /dev/null
+++ b/docs/getting-started.md
@@ -0,0 +1,61 @@
+## Installation
+
+```bash
+pip install pyspark-msgraph-source
+```
+
+---
+
+### 1. Authentication
+
+This package uses [DefaultAzureCredential](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential).
+Ensure you're authenticated:
+
+```bash
+az login
+```
+
+Or set environment variables:
+```bash
+export AZURE_CLIENT_ID=<your-client-id>
+export AZURE_TENANT_ID=<your-tenant-id>
+export AZURE_CLIENT_SECRET=<your-client-secret>
+```
+
+### 2. Example Usage
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder \
+.appName("MSGraphExample") \
+.getOrCreate()
+
+from pyspark_msgraph_source.core.source import MSGraphDataSource
+spark.dataSource.register(MSGraphDataSource)
+
+df = spark.read.format("msgraph") \
+.option("resource", "list_items") \
+.option("site-id", "<your-site-id>") \
+.option("list-id", "<your-list-id>") \
+.option("top", 100) \
+.option("expand", "fields") \
+.load()
+
+df.show()
+
+# With an explicit schema
+
+df = spark.read.format("msgraph") \
+.option("resource", "list_items") \
+.option("site-id", "<your-site-id>") \
+.option("list-id", "<your-list-id>") \
+.option("top", 100) \
+.option("expand", "fields") \
+.schema("id string, Title string") \
+.load()
+
+df.show()
+
+
+```
\ No newline at end of file
diff --git a/docs/guides/list-items.md b/docs/guides/list-items.md
new file mode 100644
index 0000000..778bb11
--- /dev/null
+++ b/docs/guides/list-items.md
@@ -0,0 +1,94 @@
+# Reading SharePoint List Items with PySpark
+
+This guide explains how to read **List Items** from a **SharePoint List** using the `pyspark-msgraph-source` connector and Microsoft Graph API.
+
+---
+
+## Prerequisites
+- Microsoft Entra (Azure AD) authentication set up with permissions to access SharePoint lists.
+- Required Microsoft Graph API permissions:
+  - `Sites.Read.All`
+  - `Lists.Read`
+- Installed `pyspark-msgraph-source` package.
+- Initialized Spark session.
+
+---
+
+## Supported Options for `list_items`
+
+| Option | Description | Required |
+|--------------|-----------------------------------------------------------|----------|
+| `resource` | Resource name (must be `"list_items"`) | ✅ Yes |
+| `site-id` | The ID of the SharePoint site | ✅ Yes |
+| `list-id` | The ID of the list within the SharePoint site | ✅ Yes |
+| `top` | (Optional) Number of records to fetch | ❌ No |
+| `expand` | (Optional) Related entities to expand (e.g., `"fields"`) | ❌ No |
+
+> **Note:** You can find `site-id` and `list-id` via Graph API explorer or SharePoint admin tools.
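+
+For example, both IDs can be resolved programmatically with `DefaultAzureCredential` and plain Microsoft Graph REST calls. This is only one way to do it; the hostname `contoso.sharepoint.com` and the site path `/sites/Marketing` below are placeholder assumptions you must replace with your own values.
+
+```python
+# Look up site-id and list-id before configuring the Spark reader.
+# Assumes the signed-in identity has at least Sites.Read.All.
+import requests
+from azure.identity import DefaultAzureCredential
+
+token = DefaultAzureCredential().get_token("https://graph.microsoft.com/.default").token
+headers = {"Authorization": f"Bearer {token}"}
+
+# Resolve the site by hostname and server-relative path.
+site = requests.get(
+    "https://graph.microsoft.com/v1.0/sites/contoso.sharepoint.com:/sites/Marketing",
+    headers=headers,
+).json()
+print(site["id"])  # value for .option("site-id", ...)
+
+# Enumerate the site's lists to find the list-id.
+lists = requests.get(
+    f"https://graph.microsoft.com/v1.0/sites/{site['id']}/lists",
+    headers=headers,
+).json()
+for item in lists["value"]:
+    print(item["displayName"], item["id"])  # value for .option("list-id", ...)
+```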
+ +--- + +## Example Usage + +```python +from pyspark_msgraph_source.core.source import MSGraphDataSource + +# Register the data source (typically required once) +spark.dataSource.register(MSGraphDataSource) + +# Read data from Microsoft Graph +df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "37d7dde8-0b6b-4b7c-a2fd-2e217f54a263") \ + .option("list-id", "5ecf26db-0161-4069-b763-856217415099") \ + .option("top", 111) \ + .option("expand", "fields") \ + .load() + +# Show the results +df.show() +``` + +--- + +## Explanation of Example +- **`spark.read.format("msgraph")`**: Use the Microsoft Graph connector. +- **`.option("resource", "list_items")`**: Specify the resource to fetch SharePoint list items. +- **`.option("site-id", "...")` and `.option("list-id", "...")`**: Provide the SharePoint site and list IDs. +- **`.option("top", 111)`**: Limit the number of records (optional). +- **`.option("expand", "fields")`**: Retrieve additional field details (optional). +- **`.load()`**: Execute the read operation. + +--- + +## Schema Inference +The connector automatically infers the schema by fetching a sample record from the API if you do not provide a schema. + +--- + +## Error Handling +- Missing or invalid `site-id` or `list-id` will raise a `ValueError`. +- API permission errors will raise authentication exceptions. +- Network or Microsoft Graph issues will raise clear, descriptive exceptions. + +--- + +## Notes +- Authentication is handled automatically via [**`DefaultAzureCredential`**](https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python#defaultazurecredential), supporting: + - Environment credentials + - Managed Identity + - Azure CLI login + - Visual Studio Code authentication + +- Use `.option("top", N)` to control the number of records retrieved for large datasets. +- To retrieve custom fields, include `.option("expand", "fields")`. + +--- + +## Troubleshooting + +| Issue | Solution | +|-----------------------------------------|-------------------------------------------------| +| `"resource is missing"` error | Ensure `.option("resource", "list_items")` | +| Empty dataframe | Check permissions and ensure valid IDs | +| `"Unsupported resource name"` error | Verify `"list_items"` is supported | \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..5752edb --- /dev/null +++ b/docs/index.md @@ -0,0 +1,81 @@ + +# Welcome to **PySpark Microsoft Graph Connector** + +Unlock seamless data access from **Microsoft Graph API** directly into **Apache Spark** using this connector designed for modern data pipelines. + +--- + +## Why Use This Connector? + +Working with Microsoft 365 data—such as SharePoint, Teams, Users, and Planner—has traditionally required intermediate services like Azure Data Factory, Logic Apps, or manual exports. With **`pyspark-msgraph-source`**, you can: + +- Authenticate securely with **Entra ID** using `DefaultAzureCredential` +- Query any supported Microsoft Graph resource directly in Spark +- Automatically handle **pagination**, **dynamic schema inference**, and **large datasets** +- Streamline analytics on Microsoft 365 data without extra infrastructure + +--- + +## What is Microsoft Graph? + +[Microsoft Graph](https://learn.microsoft.com/en-us/graph/overview) is the gateway to data and intelligence in Microsoft 365. 
It provides unified access to: + +- **Users** +- **Groups** +- **Calendars** +- **SharePoint Lists** +- **Teams Channels** +- **Planner Tasks** +- And much more! + +--- + +## What Can You Build? + +- Reporting and analytics on SharePoint Lists +- Business intelligence dashboards with Microsoft Teams activity +- Enterprise insights from Entra ID (Azure AD) +- And much more! + +--- + +## How Does It Work? + +1. Configure your Microsoft Entra (Azure AD) application. +2. Authenticate with `DefaultAzureCredential`. +3. Load data into Spark using `.read.format("msgraph")`. +4. Query, process, and analyze at scale. + +--- + +## Example + +```python +df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "") \ + .option("list-id", "") \ + .load() + +df.show() +``` + +--- + +## Ready to Get Started? + +- Check out the [Getting Started Guide](getting-started.md) +- Explore available [Resources](api/resources) +- Learn how to [Contribute](contributing.md) + +--- + +## Need Help? + +- Open an [issue](https://github.com/geekwhocodes/pyspark-msgraph-source/issues) +- Start a discussion with the community +- Submit feature requests and improvements + +--- + +Welcome aboard and happy querying! 🚀 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..c91a532 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,30 @@ +site_name: PySpark MSGraph Source +theme: + name: material + +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: ["src/"] + options: + show_source: false + +nav: + - Home: index.md + - Getting Started: getting-started.md + - Guides: + - List Items: guides/list-items.md + - Available Resource: api/resources/index.md + - API Reference: + - Overview: api/index.md + - Core: + - Source: api/core/source.md + - Base Client: api/core/client.md + - Resource Provider: api/core/resource-provider.md + - Models: api/core/models.md + - Async Iterator: api/core/async-iterator.md + - Utils: api/core/utils.md + - Resources: + - List Items: api/resources/list-items.md diff --git a/poetry.lock b/poetry.lock index 6d15189..77f6987 100644 --- a/poetry.lock +++ b/poetry.lock @@ -240,6 +240,40 @@ msal = ">=1.30.0" msal-extensions = ">=1.2.0" typing-extensions = ">=4.0.0" +[[package]] +name = "babel" +version = "2.17.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, + {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, +] + +[package.extras] +dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] + +[[package]] +name = "backrefs" +version = "5.8" +description = "A wrapper around re and regex that adds additional back references." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"}, + {file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"}, + {file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"}, + {file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"}, + {file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"}, + {file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"}, +] + +[package.extras] +extras = ["regex"] + [[package]] name = "black" version = "25.1.0" @@ -291,7 +325,7 @@ version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, @@ -396,7 +430,7 @@ version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -514,7 +548,6 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -821,6 +854,24 @@ files = [ {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] +[[package]] +name = "ghp-import" +version = "2.1.0" +description = "Copy your docs directly to the gh-pages branch." +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, + {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.1" + +[package.extras] +dev = ["flake8", "markdown", "twine", "wheel"] + [[package]] name = "googleapis-common-protos" version = "1.67.0" @@ -839,6 +890,21 @@ protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4 [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] +[[package]] +name = "griffe" +version = "1.6.0" +description = "Signatures for entire Python programs. 
Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"}, + {file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"}, +] + +[package.dependencies] +colorama = ">=0.4" + [[package]] name = "grpcio" version = "1.70.0" @@ -1045,7 +1111,7 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -1197,6 +1263,24 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] +[[package]] +name = "jinja2" +version = "3.1.5" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + [[package]] name = "jupyter-client" version = "8.6.3" @@ -1257,6 +1341,77 @@ files = [ docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] testing = ["coverage", "pyyaml"] +[[package]] +name = "markupsafe" +version = "3.0.2" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = 
"MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -1284,6 +1439,18 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +description = "A deep merge function for 🐍." +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, + {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, +] + [[package]] name = "microsoft-kiota-abstractions" version = "1.9.2" @@ -1398,6 +1565,157 @@ files = [ [package.dependencies] microsoft-kiota-abstractions = ">=1.9.2,<1.10.0" +[[package]] +name = "mkdocs" +version = "1.6.1" +description = "Project documentation with Markdown." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"}, + {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} +ghp-import = ">=1.0" +jinja2 = ">=2.11.1" +markdown = ">=3.3.6" +markupsafe = ">=2.0.1" +mergedeep = ">=1.3.4" +mkdocs-get-deps = ">=0.2.0" +packaging = ">=20.5" +pathspec = ">=0.11.1" +pyyaml = ">=5.1" +pyyaml-env-tag = ">=0.1" +watchdog = ">=2.0" + +[package.extras] +i18n = ["babel (>=2.9.0)"] +min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4) ; platform_system == \"Windows\"", "ghp-import (==1.0)", "importlib-metadata (==4.4) ; python_version < \"3.10\"", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] + +[[package]] +name = "mkdocs-autorefs" +version = "1.4.0" +description = "Automatically link across pages in MkDocs." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocs_autorefs-1.4.0-py3-none-any.whl", hash = "sha256:bad19f69655878d20194acd0162e29a89c3f7e6365ffe54e72aa3fd1072f240d"}, + {file = "mkdocs_autorefs-1.4.0.tar.gz", hash = "sha256:a9c0aa9c90edbce302c09d050a3c4cb7c76f8b7b2c98f84a7a05f53d00392156"}, +] + +[package.dependencies] +Markdown = ">=3.3" +markupsafe = ">=2.0.1" +mkdocs = ">=1.1" + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"}, + {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"}, +] + +[package.dependencies] +mergedeep = ">=1.3.4" +platformdirs = ">=2.2.0" +pyyaml = ">=5.1" + +[[package]] +name = "mkdocs-material" +version = "9.6.7" +description = "Documentation that simply works" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"}, + {file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"}, +] + +[package.dependencies] +babel = ">=2.10,<3.0" +backrefs = ">=5.7.post1,<6.0" +colorama = ">=0.4,<1.0" +jinja2 = ">=3.0,<4.0" +markdown = ">=3.2,<4.0" +mkdocs = ">=1.6,<2.0" +mkdocs-material-extensions = ">=1.3,<2.0" +paginate = ">=0.5,<1.0" +pygments = ">=2.16,<3.0" +pymdown-extensions = ">=10.2,<11.0" +requests = ">=2.26,<3.0" + +[package.extras] +git = ["mkdocs-git-committers-plugin-2 (>=1.1,<3)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] +imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"] +recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +description = "Extension pack for Python Markdown and MkDocs Material." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"}, + {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, +] + +[[package]] +name = "mkdocstrings" +version = "0.28.2" +description = "Automatic documentation from sources, for MkDocs." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocstrings-0.28.2-py3-none-any.whl", hash = "sha256:57f79c557e2718d217d6f6a81bf75a0de097f10e922e7e5e00f085c3f0ff6895"}, + {file = "mkdocstrings-0.28.2.tar.gz", hash = "sha256:9b847266d7a588ea76a8385eaebe1538278b4361c0d1ce48ed005be59f053569"}, +] + +[package.dependencies] +Jinja2 = ">=2.11.1" +Markdown = ">=3.6" +MarkupSafe = ">=1.1" +mkdocs = ">=1.4" +mkdocs-autorefs = ">=1.4" +mkdocs-get-deps = ">=0.2" +mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} +pymdown-extensions = ">=6.3" + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "1.16.2" +description = "A Python handler for mkdocstrings." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "mkdocstrings_python-1.16.2-py3-none-any.whl", hash = "sha256:ff7e719404e59ad1a72f1afbe854769984c889b8fa043c160f6c988e1ad9e966"}, + {file = "mkdocstrings_python-1.16.2.tar.gz", hash = "sha256:942ec1a2e0481d28f96f93be3d6e343cab92a21e5baf01c37dd2d7236c4d0bd7"}, +] + +[package.dependencies] +griffe = ">=0.49" +mkdocs-autorefs = ">=1.4" +mkdocstrings = ">=0.28.2" + [[package]] name = "msal" version = "1.31.1" @@ -1795,6 +2113,22 @@ files = [ {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] +[[package]] +name = "paginate" +version = "0.5.7" +description = "Divides large result sets into pages for easier browsing" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"}, + {file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"}, +] + +[package.extras] +dev = ["pytest", "tox"] +lint = ["black"] + [[package]] name = "pandas" version = "2.2.3" @@ -2330,6 +2664,25 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pymdown-extensions" +version = "10.14.3" +description = "Extension pack for Python Markdown." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pymdown_extensions-10.14.3-py3-none-any.whl", hash = "sha256:05e0bee73d64b9c71a4ae17c72abc2f700e8bc8403755a00580b49a4e9f189e9"}, + {file = "pymdown_extensions-10.14.3.tar.gz", hash = "sha256:41e576ce3f5d650be59e900e4ceff231e0aed2a88cf30acaee41e02f063a061b"}, +] + +[package.dependencies] +markdown = ">=3.6" +pyyaml = "*" + +[package.extras] +extra = ["pygments (>=2.19.1)"] + [[package]] name = "pyspark" version = "4.0.0.dev2" @@ -2491,6 +2844,21 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "pyyaml-env-tag" +version = "0.1" +description = "A custom YAML tag for referencing environment variables in YAML files. " +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, + {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, +] + +[package.dependencies] +pyyaml = "*" + [[package]] name = "pyzmq" version = "26.2.1" @@ -2619,7 +2987,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2758,7 +3126,7 @@ version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, @@ -2791,6 +3159,49 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +[[package]] +name = "watchdog" +version = "6.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d1cdb490583ebd691c012b3d6dae011000fe42edb7a82ece80965b42abd61f26"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc64ab3bdb6a04d69d4023b29422170b74681784ffb9463ed4870cf2f3e66112"}, + {file = "watchdog-6.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c897ac1b55c5a1461e16dae288d22bb2e412ba9807df8397a635d88f671d36c3"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2"}, + {file = "watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860"}, + {file = "watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134"}, + {file = "watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e6f0e77c9417e7cd62af82529b10563db3423625c5fce018430b249bf977f9e8"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:90c8e78f3b94014f7aaae121e6b909674df5b46ec24d6bebc45c44c56729af2a"}, + {file = "watchdog-6.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7631a77ffb1f7d2eefa4445ebbee491c720a5661ddf6df3498ebecae5ed375c"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c7ac31a19f4545dd92fc25d200694098f42c9a8e391bc00bdd362c5736dbf881"}, + {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9513f27a1a582d9808cf21a07dae516f0fab1cf2d7683a742c498b93eedabb11"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7a0e56874cfbc4b9b05c60c8a1926fedf56324bb08cfbc188969777940aef3aa"}, + {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6439e374fc012255b4ec786ae3c4bc838cd7309a540e5fe0952d03687d8804e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c"}, + {file = "watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2"}, + {file = "watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a"}, + {file = "watchdog-6.0.0-py3-none-win_amd64.whl", hash = 
"sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680"}, + {file = "watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f"}, + {file = "watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282"}, +] + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + [[package]] name = "wcwidth" version = "0.2.13" @@ -3012,4 +3423,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.12,<4" -content-hash = "55fe3e2bccd32c0c86b24ee1f86e76a6137c76af323fb7d50cc53b9b5d5ca1f3" +content-hash = "24c46e7ab41949a8b9dd45260a7c6725f13c1133550548e3010cf1bd30f5a2e6" diff --git a/pyproject.toml b/pyproject.toml index b497866..b37169b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,13 @@ [project] -name = "source-msgraph" +name = "pyspark_msgraph_source" version = "0.1.0" -description = "" +description = "Pyspark custom data source for Microsoft Graph APIs, including path and query parameters, with PySpark read examples." authors = [ {name = "geekwhocodes",email = "ganeshraskar@outlook.com"} ] readme = "README.md" +homepage = "https://github.com/geekwhocodes/pyspark-msgraph-source" +repository = "https://github.com/geekwhocodes/pyspark-msgraph-source" requires-python = ">=3.12,<4" dependencies = [ "pyspark (==4.0.0.dev2)", @@ -15,8 +17,10 @@ dependencies = [ ] [tool.poetry] -packages = [{include = "source_msgraph", from = "src"}] +packages = [{include = "pyspark_msgraph_source", from = "src"}] +[tool.poetry.extras] +list_items= [] [tool.poetry.group.dev.dependencies] pytest = "^8.3.4" @@ -31,6 +35,9 @@ grpcio-status = "^1.60.1" pandas = "^2.2.0" ipykernel = "^6.29.5" markdown = "^3.7" +mkdocs = "^1.6.1" +mkdocs-material = "^9.6.7" +mkdocstrings = {extras = ["python"], version = "^0.28.2"} [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/src/source_msgraph/__init__.py b/src/pyspark_msgraph_source/__init__.py similarity index 100% rename from src/source_msgraph/__init__.py rename to src/pyspark_msgraph_source/__init__.py diff --git a/src/pyspark_msgraph_source/core/__init__.py b/src/pyspark_msgraph_source/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/source_msgraph/async_interator.py b/src/pyspark_msgraph_source/core/async_iterator.py similarity index 94% rename from src/source_msgraph/async_interator.py rename to src/pyspark_msgraph_source/core/async_iterator.py index b2c121a..4216dd9 100644 --- a/src/source_msgraph/async_interator.py +++ b/src/pyspark_msgraph_source/core/async_iterator.py @@ -9,6 +9,8 @@ class AsyncToSyncIterator: """ Converts an async generator into a synchronous iterator while ensuring proper event loop handling. + + This is required because Microsoft Graph SDK for Python(https://github.com/microsoftgraph/msgraph-sdk-python) is async first. 
""" def __init__(self, async_gen: AsyncGenerator[Any, None]): diff --git a/src/pyspark_msgraph_source/core/base_client.py b/src/pyspark_msgraph_source/core/base_client.py new file mode 100644 index 0000000..ec4b162 --- /dev/null +++ b/src/pyspark_msgraph_source/core/base_client.py @@ -0,0 +1,160 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict +from msgraph import GraphServiceClient +from kiota_abstractions.base_request_configuration import RequestConfiguration +from msgraph.generated.models.o_data_errors.o_data_error import ODataError +from pyspark_msgraph_source.core.async_iterator import AsyncToSyncIterator +from pyspark_msgraph_source.core.models import BaseResource +from pyspark_msgraph_source.core.utils import get_python_schema, to_json, to_pyspark_schema + +from azure.identity import DefaultAzureCredential + + +class BaseResourceProvider(ABC): + """ + Abstract base class to handle fetching data from Microsoft Graph API and + provide schema extraction for resources. + """ + + def __init__(self, options: Dict[str, Any]): + """ + Initializes the resource provider with Graph client and options. + + This sets up the Microsoft Graph client using `DefaultAzureCredential`, + which automatically handles Azure Active Directory (AAD) authentication + by trying multiple credential types in a fixed order, such as: + + - Environment variables + - Managed Identity (for Azure-hosted environments) + - Azure CLI credentials + - Visual Studio Code login + - Interactive browser login (if applicable) + + This allows seamless local development and production deployments + without code changes to the authentication mechanism. + + See Also: + defaultazurecredential: + https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity.defaultazurecredential + + Args: + options (Dict[str, Any]): Connector options including authentication + details and resource configurations. + + Raises: + CredentialUnavailableError: If no valid credentials are found during + authentication. + """ + self.options = options + credentials = DefaultAzureCredential() + self.graph_client = GraphServiceClient(credentials=credentials) + + async def fetch_data(self): + """ + Asynchronously fetches data from Microsoft Graph API with automatic + pagination handling. + + Yields: + Any: Each record fetched from the API. + + Raises: + ValueError: If the resource query parameters cannot be instantiated. + AttributeError: If invalid query parameters are provided. + Exception: If a Graph API error occurs. 
+ + Example: + async for record in provider.fetch_data(): + print(record) + """ + query_parameters_cls = self.resource.get_query_parameters_cls() + + if query_parameters_cls: + try: + query_parameters_instance = query_parameters_cls() + except TypeError as e: + raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") + + if self.resource.query_params: + for k, v in self.resource.query_params.items(): + k = k.removeprefix("%24") + if hasattr(query_parameters_instance, k): + setattr(query_parameters_instance, k, v) + else: + raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") + + request_configuration = RequestConfiguration( + query_parameters=query_parameters_instance + ) + + try: + builder = self.resource.get_request_builder_cls()( + self.graph_client.request_adapter, + self.resource.resource_params + ) + items = await builder.get(request_configuration=request_configuration) + while True: + for item in items.value: + yield item + if not items.odata_next_link: + break + items = await builder.with_url(items.odata_next_link).get() + + except ODataError as e: + raise Exception(f"Graph API Error: {e.error.message}") + + def iter_records(self): + """ + Provides a synchronous iterator over records from the Microsoft Graph API. + + Returns: + Iterator[Any]: Synchronous iterator over the fetched records. + + Raises: + ValueError: If required credentials or resource parameters are missing. + Exception: If the API request fails. + + Example: + for record in provider.iter_records(): + print(record) + """ + async_gen = self.fetch_data() + return AsyncToSyncIterator(async_gen) + + def get_resource_schema(self) -> Dict[str, Any]: + """ + Retrieves the schema of a Microsoft Graph API resource by sampling a record. + + Returns: + Tuple[Dict[str, Any], StructType]: A tuple containing the sample record + and its corresponding PySpark schema. + + Raises: + ValueError: If no records are found or required options are missing. + Exception: If the API request fails. + + Example: + record, schema = provider.get_resource_schema() + """ + async_gen = self.fetch_data() + + try: + record = next(AsyncToSyncIterator(async_gen), None) + if not record: + raise ValueError(f"No records found for resource: {self.resource.resource_name}") + record = to_json(record) + schema = to_pyspark_schema(get_python_schema(record)) + return record, schema + + except StopIteration: + raise ValueError(f"No records available for {self.resource.resource_name}") + + @abstractmethod + def resource(self) -> BaseResource: + """ + Abstract property that must be implemented to provide the resource + configuration. + + Returns: + BaseResource: The resource definition to use for fetching data. + """ + ... 
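For orientation, the concrete `ListItemsResourceProvider` further down in this diff shows how this base class is meant to be subclassed. As an illustrative sketch only, a hypothetical provider for the `sites` resource (reusing the `sites.sites_request_builder` module path that appeared in the removed `resources.py` configuration) could look roughly like this:

```python
from functools import cached_property
from typing import Dict

from pyspark_msgraph_source.core.base_client import BaseResourceProvider
from pyspark_msgraph_source.core.models import BaseResource


class SitesResourceProvider(BaseResourceProvider):
    """Hypothetical provider for SharePoint sites, mirroring ListItemsResourceProvider."""

    def __init__(self, options: Dict[str, str]):
        self.options = options
        super().__init__(options)

    @cached_property
    def resource(self) -> BaseResource:
        # Module path taken from the RESOURCE_CONFIGS entry in the old resources.py;
        # map_options_to_params validates user options against the URL template.
        return BaseResource(
            name="sites",
            resource_name="sites",
            request_builder_module="sites.sites_request_builder",
        ).map_options_to_params(self.options)
```

For the dynamic loader in `resource_provider.py` to discover such a class, it would need to live in a module under `pyspark_msgraph_source/resources/` and keep the `ResourceProvider` class-name suffix.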
diff --git a/src/source_msgraph/constants.py b/src/pyspark_msgraph_source/core/constants.py similarity index 100% rename from src/source_msgraph/constants.py rename to src/pyspark_msgraph_source/core/constants.py diff --git a/src/source_msgraph/models.py b/src/pyspark_msgraph_source/core/models.py similarity index 58% rename from src/source_msgraph/models.py rename to src/pyspark_msgraph_source/core/models.py index 1f5c046..7ee6357 100644 --- a/src/source_msgraph/models.py +++ b/src/pyspark_msgraph_source/core/models.py @@ -1,16 +1,31 @@ from dataclasses import dataclass import importlib import inspect +import logging import re from typing import Any, Dict -from source_msgraph.constants import MSGRAPH_SDK_PACKAGE +from pyspark_msgraph_source.core.constants import MSGRAPH_SDK_PACKAGE from urllib.parse import unquote from kiota_abstractions.base_request_builder import BaseRequestBuilder + @dataclass class BaseResource: - name: str # User friendly name for Spark reader - resource_name: str # Microsoft Graph leaf resource name + """ + Represents a resource from Microsoft Graph API, such as list_items, users, etc. + + Attributes: + name (str): User-friendly name for the Spark reader. + resource_name (str): Microsoft Graph leaf resource name (e.g., users, items). + request_builder_module (str): Module path of the request builder class from the MSGraph Python SDK. + query_params (Dict[str, Any], optional): Extracted query parameters from the URL template. + resource_params (Dict[str, Any], optional): Extracted path parameters from the URL template. + request_builder_cls_name (str, optional): PascalCase name of the request builder class. + request_builder_query_cls_name (str, optional): PascalCase name of the request builder's query parameters class. + """ + + name: str + resource_name: str request_builder_module: str query_params: Dict[str, Any] = None resource_params: Dict[str, Any] = None @@ -18,27 +33,42 @@ class BaseResource: request_builder_query_cls_name: str = None def __post_init__(self): + """ + Initializes derived attributes and parses the URL template. + + Raises: + ValueError: If the 'name' attribute is not provided. + """ if not self.name: raise ValueError("name is required") - + self.request_builder_cls_name = self._pascal_case(f"{self.resource_name}_request_builder") - #self.request_builder_cls = self.get_request_builder_cls() self.request_builder_query_cls_name = self._pascal_case(f"{self.resource_name}_request_builder_get_query_parameters") - #self.query_parameters_cls = self.get_query_parameters_cls() self.parse_url_template() - @classmethod def _pascal_case(cls, snake_str: str) -> str: """ - Converts snake_case to PascalCase. - Example: "items_request_builder" -> "ItemsRequestBuilder" + Converts a snake_case string to PascalCase. + + Args: + snake_str (str): The snake_case string to convert. + + Returns: + str: PascalCase formatted string. """ return "".join(word.title() for word in snake_str.split("_")) - + def get_query_parameters_cls(self): """ Retrieves the query parameters class from the request builder module. + + Returns: + Any: Query parameters class object. + + Raises: + ImportError: If the request builder module is not found. + AttributeError: If the required class is not found. 
""" try: module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") @@ -47,19 +77,24 @@ def get_query_parameters_cls(self): if not request_builder_cls or not issubclass(request_builder_cls, BaseRequestBuilder): raise AttributeError(f"{self.request_builder_cls_name} not found in {module.__name__}") - # Inspect the attributes to find the query parameters class - for attr in dir(request_builder_cls): if attr == self.request_builder_query_cls_name: return getattr(request_builder_cls, attr) - raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") - + raise AttributeError(f"{self.request_builder_query_cls_name} not found in {module.__name__}") + except ModuleNotFoundError: raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") def get_request_builder_cls(self) -> BaseRequestBuilder: """ - Dynamically imports a module and finds the RequestBuilder class. + Dynamically imports a module and retrieves the request builder class. + + Returns: + BaseRequestBuilder: The request builder class. + + Raises: + ImportError: If the module is not found. + AttributeError: If the class is not valid. """ try: module = importlib.import_module(f"{MSGRAPH_SDK_PACKAGE}.{self.request_builder_module}") @@ -71,103 +106,116 @@ def get_request_builder_cls(self) -> BaseRequestBuilder: return cls except ImportError: raise ImportError(f"Module {self.request_builder_module} not found in {MSGRAPH_SDK_PACKAGE}") - + def get_request_builder_url_template(self): """ - Extracts the `url_template` by analyzing the source code of the class. + Extracts the URL template from the request builder class's __init__ method. + + Returns: + str: URL template string. + + Raises: + TypeError: If the URL template cannot be extracted. """ try: cls = self.get_request_builder_cls() if inspect.isclass(cls) and hasattr(cls, "__init__"): - # Extract the __init__ function source code init_source = inspect.getsource(cls.__init__) if "super().__init__(" in init_source: - lines = init_source.split("\n") - for line in lines: + for line in init_source.split("\n"): if "super().__init__(" in line: match = re.search(r'super\(\).__init__\s*\([^,]+,\s*"([^"]+)"', line) if match: - url_template = match.group(1).replace('"', "").replace("'", "") - return url_template - + return match.group(1).replace('"', "").replace("'", "") except TypeError: raise TypeError(f"Error extracting URL template from {cls.__name__}") def parse_url_template(self): """ - Parses the `url_template` string to extract path parameters and query parameters. + Parses the URL template to extract path and query parameters. + + Raises: + ValueError: If the URL template is not found. 
""" url_template = self.get_request_builder_url_template() if not url_template: raise ValueError("URL template not found in request builder class") - # Extract path parameters (decode %2Did → _id) path_parameters = [ unquote(match.group(1)).replace("%2D", "_") for match in re.finditer(r"\{([^?}]+)\}", url_template) if match.group(1).lower() != "+baseurl" ] - # Extract query parameters (decode %24expand → $expand) query_match = re.search(r"\{\?([^}]+)\}", url_template) query_parameters = ( [unquote(q).replace("%24", "$") for q in query_match.group(1).split(",")] if query_match else [] ) - self.resource_params = {k:None for k in path_parameters} + self.resource_params = {k: None for k in path_parameters} self.query_params = {qp.strip().replace("$", ""): None for qp in query_parameters} - def map_options_to_params(self, options: Dict[str, Any]) -> 'BaseResource': """ - Maps the provided options to either query parameters or resource parameters. + Maps provided options to valid query and resource parameters. + + Args: + options (Dict[str, Any]): User-provided options. + + Returns: + BaseResource: Updated instance with mapped parameters. - :param options: Dictionary of options provided by the user. - :param query_params: List of valid query parameter names. - :param resource_params: List of valid resource parameter names. - :return: A tuple (mapped_query_params, mapped_resource_params, invalid_params) + Raises: + ValueError: If required resource parameters are missing or extra parameters are provided. """ missing_params = [param for param in self.resource_params if param not in options] if missing_params: raise ValueError(f"Missing required resource parameters: {', '.join(missing_params)}") - # TODO: add max $top value validation. + if int(options.get("top", 1)) <= 100: + logging.warning("Setting a low `top` value in Microsoft Graph queries can cause high latency and increase throttling risk.") - mapped_query_params = {"%24"+k: v for k, v in options.items() if k in self.query_params} + mapped_query_params = {"%24" + k: v for k, v in options.items() if k in self.query_params} mapped_resource_params = {k.replace("-", "%2D"): v for k, v in options.items() if k in self.resource_params} - + invalid_params = {k: v for k, v in options.items() if k not in self.query_params and k not in self.resource_params} - - if len(invalid_params) > 0: + + if invalid_params: raise ValueError(f"Extra parameters {invalid_params} not allowed.") - + self.query_params = mapped_query_params self.resource_params = mapped_resource_params - + return self + GUID_PATTERN = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") @dataclass class ConnectorOptions: - """Options for Microsoft Graph API requests with strict resource_path validation.""" + """ + Options for Microsoft Graph API requests with strict credential validation. + + Attributes: + tenant_id (str): Azure tenant ID (GUID). + client_id (str): Azure client ID (GUID). + client_secret (str): Azure client secret. + """ tenant_id: str client_id: str client_secret: str - resource: BaseResource + def __post_init__(self): ... 
- + def _validate_credentials(self): - """Validates the format and presence of credentials.""" - if not self.tenant_id or not GUID_PATTERN.match(self.tenant_id): - raise ValueError("Invalid tenant_id: must be a valid GUID.") - - if not self.client_id or not GUID_PATTERN.match(self.client_id): - raise ValueError("Invalid client_id: must be a valid GUID.") - - if not self.client_secret or not isinstance(self.client_secret, str): - raise ValueError("Invalid client_secret: must be a non-empty string.") \ No newline at end of file + """ + Validates the format and presence of credentials. + + Raises: + ValueError: If any credential is invalid or missing. + """ + ... diff --git a/src/pyspark_msgraph_source/core/resource_provider.py b/src/pyspark_msgraph_source/core/resource_provider.py new file mode 100644 index 0000000..468e5d1 --- /dev/null +++ b/src/pyspark_msgraph_source/core/resource_provider.py @@ -0,0 +1,88 @@ +from functools import lru_cache +import importlib +import logging +import pkgutil +from typing import Dict, Type +from pyspark_msgraph_source.core.base_client import BaseResourceProvider + + +# @lru_cache(maxsize=10) +def load_resource_providers() -> Dict[str, Type[BaseResourceProvider]]: + """ + Dynamically loads all resource providers from the `resources` package. + + This function scans the `resources` subpackage of the current root package, + discovers all modules (excluding `base.py`), and imports any classes ending + with `ResourceProvider` that are subclasses of `BaseResourceProvider`. + + This allows dynamic discovery and registration of new resource providers + without requiring explicit imports. + + Returns: + Dict[str, Type[BaseResourceProvider]]: A dictionary mapping resource + names (module names) to their corresponding resource provider classes. + + Example: + providers = load_resource_providers() + print(providers.keys()) + """ + providers = {} + root_package = __package__.split('.')[0] + logging.debug(f"Current root package {root_package}.") + + package = f'{root_package}.resources' + resources_pkg = importlib.import_module(package) + + for _, name, _ in pkgutil.iter_modules(resources_pkg.__path__): + if name != 'base': # Skip the base module + try: + module = importlib.import_module(f'{package}.{name}') + for attr_name in dir(module): + if attr_name.endswith('ResourceProvider'): + provider_class = getattr(module, attr_name) + if (isinstance(provider_class, type) and + issubclass(provider_class, BaseResourceProvider) and + provider_class != BaseResourceProvider): + providers[name] = provider_class + except ImportError as e: + print(f"Warning: Could not load resource provider {name}: {e}") + + return providers + + +# @lru_cache(maxsize=10) +def get_resource_provider(resource_name: str, options: frozenset) -> BaseResourceProvider: + """ + Factory method to retrieve the appropriate resource provider based on its name. + + This function looks up the resource provider class registered in + `load_resource_providers()`, instantiates it with the provided options, + and returns the instance. + + Args: + resource_name (str): The name of the resource (typically the module name). + options (frozenset): A frozenset of key-value pairs representing the + configuration options for the provider. + + Returns: + BaseResourceProvider: An instance of the corresponding resource provider. + + Raises: + ValueError: If the requested resource name is not found in the + available providers. 
+ + Example: + provider = get_resource_provider('users', frozenset({'tenant_id': 'xxx'}.items())) + for record in provider.iter_records(): + print(record) + """ + providers = dict(load_resource_providers()) + provider_class: BaseResourceProvider = providers.get(resource_name) + + if not provider_class: + available = ', '.join(providers.keys()) + raise ValueError( + f"Unsupported resource name: '{resource_name}'. " + f"Available resources: {available}" + ) + return provider_class(dict(options)) diff --git a/src/pyspark_msgraph_source/core/source.py b/src/pyspark_msgraph_source/core/source.py new file mode 100644 index 0000000..c5e85a6 --- /dev/null +++ b/src/pyspark_msgraph_source/core/source.py @@ -0,0 +1,128 @@ +import logging +from typing import Any, Dict, Iterator, Tuple, Union +from pyspark.sql.datasource import DataSource, DataSourceReader +from pyspark.sql.types import StructType +from pyspark_msgraph_source.core.base_client import BaseResourceProvider +from pyspark_msgraph_source.core.resource_provider import get_resource_provider + +# Reference: https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources + +logger = logging.getLogger(__name__) + + +class MSGraphDataSource(DataSource): + """ + A custom PySpark DataSource implementation to read data from Microsoft Graph API. + + This datasource uses dynamic resource providers to connect to different + Microsoft Graph resources based on the `resource` option. + + If schema inference is required, it fetches sample data to infer the schema. + + See Also: + Databricks PySpark DataSource API: + https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources + + Args: + options (Dict[str, Any]): Connector options, including the required + `resource` name and authentication parameters. + + Raises: + ValueError: If the `resource` option is missing. + + Example: + df = spark.read.format("msgraph") \ + .option("resource", "list_items") \ + .option("site-id", "") \ + .option("list-id", "") \ + .option("top", 999) \ + .option("expand", "fields") \ + .load() + + df.show() + """ + + def __init__(self, options: Dict[str, Any]): + self.resource_name = options.pop("resource", None) + if not self.resource_name: + raise ValueError("resource is missing, please provide a valid resource name.") + self.options = frozenset(options.items()) + + @classmethod + def name(cls) -> str: + """ + Returns the registered name of the DataSource. + + Returns: + str: The name of the DataSource, "msgraph". + """ + return "msgraph" + + def schema(self): + """ + Infers the schema of the Microsoft Graph resource. + + This will call the corresponding resource provider to fetch a sample + record and determine its schema. + + Returns: + StructType: The inferred schema of the resource. + """ + logger.info("Schema not provided, inferring from the source.") + resource_provider: BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + _, schema = resource_provider.get_resource_schema() + logger.debug(f"Inferred schema: {schema}") + return schema + + def reader(self, schema: StructType) -> "MSGraphDataSourceReader": + """ + Provides the DataSourceReader to read data. + + Args: + schema (StructType): The schema to apply to the records. + + Returns: + MSGraphDataSourceReader: The configured reader for this resource. + """ + return MSGraphDataSourceReader(self.resource_name, self.options, schema) + + +class MSGraphDataSourceReader(DataSourceReader): + """ + A DataSourceReader to fetch records from a Microsoft Graph resource. 
+ + This reader uses the resource provider to iterate over records and + yields rows compatible with the provided schema. + + Args: + resource_name (str): The name of the Microsoft Graph resource. + options (frozenset): Connector options. + schema (Union[StructType, str]): The schema to apply to the records. + """ + + def __init__(self, resource_name: str, options: frozenset, schema: Union[StructType, str]): + self.schema: StructType = schema + self.options = options + self.resource_name = resource_name + + def read(self, partition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]: # type: ignore + """ + Reads records from the Microsoft Graph API. + + For each record fetched from the resource provider, it transforms + the record into a PySpark Row object matching the schema. + + Args: + partition: Unused in this implementation (for future partitioning support). + + Yields: + Row: A PySpark Row object for each record. + """ + from pyspark_msgraph_source.core.utils import to_json + from pyspark.sql import Row + + resource_provider: BaseResourceProvider = get_resource_provider(self.resource_name, self.options) + for row in resource_provider.iter_records(): + row = to_json(row) + row_data = {f.name: row.get(f.name, None) for f in self.schema.fields} + yield Row(**row_data) diff --git a/src/pyspark_msgraph_source/core/utils.py b/src/pyspark_msgraph_source/core/utils.py new file mode 100644 index 0000000..7f9d658 --- /dev/null +++ b/src/pyspark_msgraph_source/core/utils.py @@ -0,0 +1,134 @@ +from typing import Any, Dict, List, Union +from kiota_serialization_json.json_serialization_writer_factory import JsonSerializationWriterFactory +import json + +from pyspark.sql.types import ( + StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, + ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType, DataType +) + +from datetime import datetime, date +from decimal import Decimal + +# Convert to JSON using Kiota +writer_factory = JsonSerializationWriterFactory() + + +def to_json(value: Any) -> Dict[str, Any]: + """ + Serializes a Kiota serializable object to a JSON-compatible dictionary. + + Args: + value (Any): An object that implements the Kiota serialization interface. + + Returns: + dict: A dictionary representing the serialized JSON content. + """ + writer = writer_factory.get_serialization_writer("application/json") + value.serialize(writer) + return json.loads(writer.get_serialized_content().decode("utf-8")) + + +def get_python_schema( + obj: Any +) -> Union[str, Dict[str, Any], List[Any]]: + """ + Recursively extracts the schema from a Python object. + + Args: + obj (Any): The Python object (e.g., dict, list, int, str) to analyze. + + Returns: + Union[str, dict, list]: A nested schema representing the object's structure and field types. + - For dicts: a dict with key-value schemas. + - For lists: a list with the schema of the first element or "any" if empty. + - For primitives: a string indicating the type ("str", "int", etc.). 
+ """ + if isinstance(obj, bool): + return "bool" + elif isinstance(obj, dict): + return {key: get_python_schema(value) for key, value in obj.items()} + elif isinstance(obj, list): + if obj: # Assume first element type (homogeneous lists) + return [get_python_schema(obj[0])] + return ["any"] # Empty lists default to "any" + elif isinstance(obj, str): + return "str" + elif isinstance(obj, int): + return "int" + elif isinstance(obj, float): + return "float" + elif isinstance(obj, datetime): + return "datetime" + elif isinstance(obj, date): + return "date" + elif isinstance(obj, Decimal): + return "decimal" + elif obj is None: + return "null" + return "unknown" # Fallback for unrecognized types + + +def to_pyspark_schema( + schema_dict: Dict[str, Any] +) -> StructType: + """ + Recursively converts a nested Python schema dictionary to a PySpark StructType schema. + + Args: + schema_dict (dict): A dictionary with field names as keys and data types as values, + where types are represented as strings (e.g., "str", "int", "bool"). + Nested dictionaries represent nested StructTypes. + + Returns: + StructType: A PySpark StructType schema reflecting the provided structure. + + Example: + Input: + {"name": "str", "age": "int", "scores": ["float"], "address": {"city": "str"}} + Output: + StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("scores", ArrayType(DoubleType()), True), + StructField("address", StructType([ + StructField("city", StringType(), True) + ]), True) + ]) + """ + type_mapping: Dict[str, DataType] = { + "str": StringType(), + "int": IntegerType(), + "float": DoubleType(), + "bool": BooleanType(), + "datetime": TimestampType(), + "date": DateType(), + "long": LongType(), + "binary": BinaryType(), + "decimal": DecimalType(38, 18), + "null": StringType(), + "unknown": StringType(), + } + + def convert_type(value: Any) -> DataType: + """ + Recursively converts type descriptors to PySpark data types. + + Args: + value (Any): The type descriptor (str, dict, list). + + Returns: + DataType: The corresponding PySpark data type. 
+ """ + if isinstance(value, dict): # Nested structure + return StructType([StructField(k, convert_type(v), True) for k, v in value.items()]) + elif isinstance(value, list): # List of elements (assume first element type) + if not value: + return ArrayType(StringType()) # Default to list of strings if empty + return ArrayType(convert_type(value[0])) + return type_mapping.get(value, StringType()) # Default to StringType + + struct_fields: List[StructField] = [ + StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items() + ] + return StructType(struct_fields) diff --git a/src/pyspark_msgraph_source/resources/__init__.py b/src/pyspark_msgraph_source/resources/__init__.py new file mode 100644 index 0000000..5ee7122 --- /dev/null +++ b/src/pyspark_msgraph_source/resources/__init__.py @@ -0,0 +1 @@ +from .list_items import * # type: ignore \ No newline at end of file diff --git a/src/pyspark_msgraph_source/resources/list_items.py b/src/pyspark_msgraph_source/resources/list_items.py new file mode 100644 index 0000000..153b2c1 --- /dev/null +++ b/src/pyspark_msgraph_source/resources/list_items.py @@ -0,0 +1,59 @@ +from functools import cached_property +import logging +from typing import Dict + +from pyspark_msgraph_source.core.base_client import BaseResourceProvider +from pyspark_msgraph_source.core.models import BaseResource + +logger = logging.getLogger(__name__) + + +class ListItemsResourceProvider(BaseResourceProvider): + """ + Resource provider for fetching list items from Microsoft Graph API. + + See Also: + https://learn.microsoft.com/en-us/graph/api/listitem-list?view=graph-rest-1.0: + https://learn.microsoft.com/en-us/graph/api/listitem-list?view=graph-rest-1.0 + + + This provider handles the setup of the `list_items` resource, + configuring the request builder and mapping options to the required parameters. + + Args: + options (Dict[str, str]): Connector options, typically containing + site ID, list ID, and any query parameters. + + Example: + provider = ListItemsResourceProvider(options) + for record in provider.iter_records(): + print(record) + """ + + def __init__(self, options: Dict[str, str]): + """ + Initializes the ListItemsResourceProvider. + + Args: + options (Dict[str, str]): Connector options required to configure + the resource and authenticate requests. + """ + self.options = options + super().__init__(options) + + @cached_property + def resource(self) -> BaseResource: + """ + Returns the BaseResource configuration for list items. + + This sets up the request builder path and resource name + required to make API calls to retrieve list items. + + Returns: + BaseResource: Configured resource with mapped options. 
+ """ + return BaseResource( + name="list_items", + resource_name="items", + request_builder_module="sites.item.lists.item.items.items_request_builder" + ).map_options_to_params(self.options) diff --git a/src/source_msgraph/client.py b/src/source_msgraph/client.py deleted file mode 100644 index b429073..0000000 --- a/src/source_msgraph/client.py +++ /dev/null @@ -1,103 +0,0 @@ -from msgraph import GraphServiceClient -from kiota_abstractions.base_request_configuration import RequestConfiguration -from msgraph.generated.models.o_data_errors.o_data_error import ODataError -from azure.identity import ClientSecretCredential -from source_msgraph.async_interator import AsyncToSyncIterator -from source_msgraph.models import ConnectorOptions -from source_msgraph.utils import get_python_schema, to_json, to_pyspark_schema -from typing import Dict, Any - -class GraphClient: - def __init__(self, options: ConnectorOptions): - """ - Initializes the fetcher with the Graph client, resource path, and query parameters. - - - :param options: Connector options. - """ - credentials = ClientSecretCredential(options.tenant_id, options.client_id, options.client_secret) - self.graph_client = GraphServiceClient(credentials=credentials) - self.options: ConnectorOptions = options - - - async def fetch_data(self): - """ - Fetches data from Microsoft Graph using the dynamically built request. - Handles pagination automatically. - """ - query_parameters_cls = self.options.resource.get_query_parameters_cls() - - if query_parameters_cls: - try: - query_parameters_instance = query_parameters_cls() # Ensure it can be instantiated without arguments - except TypeError as e: - raise ValueError(f"Failed to instantiate {query_parameters_cls.__name__}: {e}") - - if self.options.resource.query_params: - for k, v in self.options.resource.query_params.items(): - k = k.removeprefix("%24") - if hasattr(query_parameters_instance, k): - setattr(query_parameters_instance, k, v) # Set attributes dynamically - else: - raise AttributeError(f"{query_parameters_cls.__name__} has no attribute '{k}'") - - request_configuration = RequestConfiguration( - query_parameters=query_parameters_instance - ) - - try: - builder = self.options.resource.get_request_builder_cls()(self.graph_client.request_adapter, self.options.resource.resource_params) - items = await builder.get(request_configuration=request_configuration) - while True: - for item in items.value: - yield item - if not items.odata_next_link: - break - items = await builder.with_url(items.odata_next_link).get() - - except ODataError as e: - raise Exception(f"Graph API Error: {e.error.message}") - - -def iter_records(options: ConnectorOptions): - """ - Iterates over records from the Microsoft Graph API. - - :param options: Connector options containing authentication credentials and resource details. - :return: A synchronous iterator over the fetched data. - :raises ValueError: If any required credentials or resource parameters are missing. - :raises GraphAPIError: If the API request fails. - """ - fetcher = GraphClient(options) - async_gen = fetcher.fetch_data() - return AsyncToSyncIterator(async_gen) - - - -def get_resource_schema(options: ConnectorOptions) -> Dict[str, Any]: - """ - Retrieves the schema of a Microsoft Graph API resource by fetching a single record. - - :param options: Connector options containing authentication credentials and resource details. - :return: A dictionary representing the schema of the resource. 
- :raises ValueError: If no records are found or if required options are missing. - :raises GraphAPIError: If the API request fails. - """ - fetcher = GraphClient(options) - async_gen = fetcher.fetch_data() - - try: - record = next(AsyncToSyncIterator(async_gen), None) - if not record: - raise ValueError(f"No records found for resource: {options.resource.resource_name}") - record = to_json(record) - schema = to_pyspark_schema(get_python_schema(record)) - return record, schema - - except StopIteration: - raise ValueError(f"No records available for {options.resource.resource_name}") - -# Example usage -# options = ConnectorOptions(...) -# schema = get_resource_schema(options) -# print(json.dumps(schema, indent=2)) diff --git a/src/source_msgraph/generate_docs.py b/src/source_msgraph/generate_docs.py deleted file mode 100644 index 0903c63..0000000 --- a/src/source_msgraph/generate_docs.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -from urllib.parse import unquote -from source_msgraph.models import BaseResource -from source_msgraph.resources import RESOURCE_CONFIGS - -def generate_markdown(resource: BaseResource) -> str: - """ - Generates sophisticated markdown documentation for a given BaseResource. - """ - md_content = [f"# {resource.name.capitalize()} Resource", ""] - md_content.append(f"**Resource Name:** `{resource.name.lower()}`") - - - md_content.append("\n## Overview") - md_content.append(f"The `{resource.name}` resource provides a structured way to interact with Microsoft Graph API.") - md_content.append("This resource supports operations such as retrieval and filtering of data.") - - md_content.append("\n## Resource Parameters") - if len(resource.resource_params.keys()) > 0: - md_content.append("| Parameter | Type | Required | Description |") - md_content.append("|-----------|------|----------|-------------|") - for param in resource.resource_params or {}: - md_content.append(f"| `{unquote(param)}` | `str` | ✅ | Required path parameter for resource access. |") - else: - md_content.append(f"> No parameters required for `{resource.name.lower()}` resource.") - - - md_content.append("\n## Query Parameters") - if len(resource.query_params.keys()) > 0: - md_content.append("| Parameter | Type | Required | Description |") - md_content.append("|-----------|------|----------|-------------|") - for param in resource.query_params or {}: - md_content.append(f"| `{unquote(param)}` | `str` | ❌ | Optional query parameter to refine the API request. |") - else: - md_content.append(f">> No query parameters are required for `{resource.name.lower()}` resource.") - - md_content.append("---") - - md_content.append("Tip: Please refer [Microsoft Graph API]() documentation if you don't see a field. 
This can be resolved by provising `expand` option.") - - md_content.append("\n## Example Usage") - md_content.append("```python") - md_content.append("from source_msgraph.source import MSGraphDataSource") - md_content.append("spark.dataSource.register(MSGraphDataSource)") - md_content.append("") - md_content.append("# Read data using Microsoft Graph") - md_content.append("df = spark.read.format(\"msgraph\") ") - md_content.append(" .option(\"tenant_id\", tenant_id)") - md_content.append(" .option(\"client_id\", client_id)") - md_content.append(" .option(\"client_secret\", client_secret)") - md_content.append(f" .option(\"resource\", \"{resource.name}\")") - for param in resource.resource_params or {}: - md_content.append(f" .option(\"{param}\", \"\")") - for param in resource.query_params or {}: - md_content.append(f" .option(\"{param}\", \"\")") - md_content.append(" .schema(\"id string, eTag string\")") - md_content.append(" .load()") - md_content.append("") - md_content.append("df.show()") - md_content.append("```") - - return "\n".join(md_content) - -def generate_docs(output_dir: str = "docs"): - """ - Generates sophisticated markdown documentation for all configured resources. - """ - os.makedirs(output_dir, exist_ok=True) - - for config in RESOURCE_CONFIGS: - resource = BaseResource( - name=config["name"], - resource_name=config["resource_name"], - request_builder_module=config["request_builder_module"] - ) - - md_content = generate_markdown(resource) - file_path = os.path.join(output_dir, f"{resource.name}.md") - with open(file_path, "w", encoding="utf-8") as f: - f.write(md_content) - print(f"Generated documentation: {file_path}") - -if __name__ == "__main__": - generate_docs() \ No newline at end of file diff --git a/src/source_msgraph/resources.py b/src/source_msgraph/resources.py deleted file mode 100644 index 43e5b08..0000000 --- a/src/source_msgraph/resources.py +++ /dev/null @@ -1,30 +0,0 @@ -# Define the resources to generate -from source_msgraph.models import BaseResource - - -RESOURCE_CONFIGS = [ - {"name": "sites", "resource_name": "sites", "request_builder_module": "sites.sites_request_builder"}, - {"name": "lists", "resource_name": "lists", "request_builder_module": "sites.item.lists.lists_request_builder"}, - {"name": "list_items", "resource_name": "items", "request_builder_module": "sites.item.lists.item.items.items_request_builder"}, -] - - - -def get_resource(name: str): - """ - Generates a list of BaseResource instances for specified Microsoft Graph resources. - """ - config = next((config for config in RESOURCE_CONFIGS if config["name"] == name), None) - if not config: - raise ValueError(f"Resource '{name}' is not supported yet. 
stay tuned!") - - # Create and store the BaseResource instance - resource = BaseResource( - name=config["name"], - resource_name=config["resource_name"], - request_builder_module=config["request_builder_module"] - ) - return resource - - - diff --git a/src/source_msgraph/source.py b/src/source_msgraph/source.py deleted file mode 100644 index 7366e3d..0000000 --- a/src/source_msgraph/source.py +++ /dev/null @@ -1,63 +0,0 @@ -import logging -from typing import Any, Dict, Union -from pyspark.sql.datasource import DataSource, DataSourceReader -from pyspark.sql.types import StructType -from source_msgraph.client import get_resource_schema, iter_records -from source_msgraph.models import ConnectorOptions - -from source_msgraph.resources import get_resource -# Reference https://learn.microsoft.com/en-us/azure/databricks/pyspark/datasources - -logger = logging.getLogger(__name__) - -class MSGraphDataSource(DataSource): - """ - - """ - def __init__(self, options: Dict[str, Any]): - - tenant_id=options.pop("tenant_id") - client_id=options.pop("client_id") - client_secret=options.pop("client_secret") - - resource_name = options.pop("resource") - if not resource_name: - raise ValueError("resource is missing, please provide a valid resource name.") - - resource = get_resource(resource_name).map_options_to_params(options) - - self.connector_options: ConnectorOptions = ConnectorOptions( - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - resource=resource - ) - - - @classmethod - def name(cls): - return "msgraph" - - def schema(self): - logger.info("Schema not provided, infering from the source.") - _, schema = get_resource_schema(self.connector_options) - logger.debug(f"Infered schema : {schema}") - return schema - - def reader(self, schema: StructType): - return MSGraphDataSourceReader(self.connector_options, schema) - - -class MSGraphDataSourceReader(DataSourceReader): - - def __init__(self, options: ConnectorOptions, schema: Union[StructType, str]): - self.schema: StructType = schema - self.options:ConnectorOptions = options - - def read(self, partition): - from source_msgraph.utils import to_json - from pyspark.sql import Row - for row in iter_records(self.options): - row = to_json(row) - row_data = {f.name: row.get(f.name, None) for f in self.schema.fields} - yield Row(**row_data) diff --git a/src/source_msgraph/utils.py b/src/source_msgraph/utils.py deleted file mode 100644 index b878c2a..0000000 --- a/src/source_msgraph/utils.py +++ /dev/null @@ -1,91 +0,0 @@ -from typing import Any -from kiota_serialization_json.json_serialization_writer_factory import JsonSerializationWriterFactory -import json - -from pyspark.sql.types import ( - StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, - MapType, ArrayType, TimestampType, DateType, LongType, BinaryType, DecimalType -) - -from datetime import datetime, date -from decimal import Decimal - -# Convert to JSON using Kiota -writer_factory = JsonSerializationWriterFactory() -writer = writer_factory.get_serialization_writer("application/json") - -def to_json(value): - value.serialize(writer) - # Get JSON string - return json.loads((writer.get_serialized_content().decode("utf-8"))) - -def to_jsonValue(value): - value.serialize(writer) - # Get JSON string - return str(json.loads((writer.get_serialized_content().decode("utf-8")))) - - - -def get_python_schema(obj:Any): - """ - Recursively extracts the schema from a Python object. - - :param obj: The Python object (dict, list, int, str, etc.). 
- :return: A schema dictionary representing field types. - """ - if isinstance(obj, bool): - return "bool" - elif isinstance(obj, dict): - return {key: get_python_schema(value) for key, value in obj.items()} - elif isinstance(obj, list): - if obj: # Assume first element type (homogeneous lists) - return [get_python_schema(obj[0])] - return ["any"] # Empty lists default to "any" - elif isinstance(obj, str): - return "str" - elif isinstance(obj, int): - return "int" - elif isinstance(obj, float): - return "float" - elif isinstance(obj, datetime): - return "datetime" - elif isinstance(obj, date): - return "date" - elif isinstance(obj, Decimal): - return "decimal" - elif obj is None: - return "null" - return "unknown" # Fallback for unrecognized types - -def to_pyspark_schema(schema_dict): - """ - Recursively converts a nested Python schema dictionary to a PySpark StructType schema. - - :param schema_dict: Dictionary with field names as keys and data types as values. - :return: PySpark StructType schema. - """ - type_mapping = { - "str": StringType(), - "int": IntegerType(), - "float": DoubleType(), - "bool": BooleanType(), - "datetime": TimestampType(), - "date": DateType(), - "long": LongType(), - "binary": BinaryType(), - "decimal": DecimalType(38, 18), - "unknown": StringType() - } - - def convert_type(value): - """Recursively converts types, handling nested dicts and lists.""" - if isinstance(value, dict): # Nested structure - return StructType([StructField(k, convert_type(v), True) for k, v in value.items()]) - elif isinstance(value, list): # List of elements (assume first element type) - if not value: - return ArrayType(StringType()) # Default to list of strings if empty - return ArrayType(convert_type(value[0])) - return type_mapping.get(value, StringType()) # Default to StringType - - struct_fields = [StructField(field, convert_type(dtype), True) for field, dtype in schema_dict.items()] - return StructType(struct_fields) \ No newline at end of file diff --git a/tests/test_source.py b/tests/test_source.py index 17cd168..777690e 100644 --- a/tests/test_source.py +++ b/tests/test_source.py @@ -1,6 +1,6 @@ import pytest from pyspark.sql import SparkSession -from source_msgraph.source import FakeDataSource +from source_pyspark_msgraph.source import FakeDataSource # @pytest.fixture
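Taken together, the pieces above can be exercised end to end. The following is a minimal sketch based on the options listed in the `MSGraphDataSource` docstring; the SharePoint site and list identifiers are placeholders, and authentication is assumed to be resolved by `DefaultAzureCredential` (for example via `az login` or the `AZURE_*` environment variables):

```python
from pyspark.sql import SparkSession

from pyspark_msgraph_source.core.source import MSGraphDataSource

spark = SparkSession.builder.appName("msgraph-example").getOrCreate()
spark.dataSource.register(MSGraphDataSource)

# Options mirror the MSGraphDataSource docstring example; schema is inferred
# from a sampled record when no explicit schema is supplied.
df = (
    spark.read.format("msgraph")
    .option("resource", "list_items")
    .option("site-id", "<your-site-id>")   # placeholder
    .option("list-id", "<your-list-id>")   # placeholder
    .option("top", 999)
    .option("expand", "fields")
    .load()
)

df.show()
```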