37 changes: 37 additions & 0 deletions integration/jaffle-shop-data/GENERATE_JAFFLE_SHOP_PARQUET.md
@@ -0,0 +1,37 @@
# Generate Parquet files from Jaffle Shop CSV data

## Prerequisites

- pipx
- gcloud CLI
> **Copilot AI** commented on lines +5 to +6 (Dec 5, 2025):
>
> Missing backticks around pipx for consistency with other command-line tool references. Should be ``- `pipx` `` to match the formatting of other tool references in the document.
>
> Suggested change:
>
> ```diff
> -- pipx
> -- gcloud CLI
> +- `pipx`
> +- `gcloud CLI`
> ```
> **Copilot AI** commented on line +6 (Dec 5, 2025):
>
> Missing backticks around gcloud for consistency with other command-line tool references. Should be ``- `gcloud` CLI`` to match the formatting style used for other tools.
>
> Suggested change:
>
> ```diff
> -- gcloud CLI
> +- `gcloud` CLI
> ```

This script reads the Jaffle Shop CSV files and converts them to Parquet format for more efficient storage and querying in Snowflake.

## Generate Jaffle Shop Data (CSV)

To generate the Jaffle Shop CSV data, run the following command:

```bash
pipx run jafgen 6
```

This will create the necessary CSV files in the `jaffle-data` directory.
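As a quick post-generation check (not part of this PR; a stdlib-only sketch that assumes jafgen's default output layout), you can verify that all expected CSV files were produced:

```python
from pathlib import Path

# Table names jafgen is expected to emit (mirrors the NAMES list in the
# conversion script in this PR).
EXPECTED = [
    "raw_customers", "raw_items", "raw_orders", "raw_products",
    "raw_stores", "raw_supplies", "raw_tweets",
]


def missing_csvs(data_dir: Path, names: list[str]) -> list[str]:
    """Return the table names whose CSV file is absent from data_dir."""
    return [n for n in names if not (data_dir / f"{n}.csv").exists()]


if missing := missing_csvs(Path("jaffle-data"), EXPECTED):
    print("Missing CSVs:", ", ".join(missing))
else:
    print("All expected CSVs present.")
```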

## Convert CSV to Parquet

To convert the generated CSV files to Parquet format, run the following script:

```bash
python convert_jaffle_csv_to_parquet.py
```

This will read each CSV file from the `jaffle-data` directory and save the corresponding Parquet files in the `jaffle-data/parquet` directory.

## Upload Parquet Files to GCP

To upload the Parquet files to your GCP bucket, use the following commands:

```bash
gcloud config set project getml-infra
gcloud storage cp jaffle-data/parquet/*.parquet gs://static.getml.com/datasets/jaffle_shop/
```
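After the upload, each file is addressable under the bucket prefix. A tiny helper (illustrative only, not part of the PR) mapping a local Parquet path to the object URL it should have after `gcloud storage cp`:

```python
from pathlib import PurePosixPath

# Destination prefix used in the upload command above.
BUCKET_PREFIX = "gs://static.getml.com/datasets/jaffle_shop/"


def destination_url(local_path: str) -> str:
    """Expected GCS object URL for a local Parquet file after upload."""
    return BUCKET_PREFIX + PurePosixPath(local_path).name


print(destination_url("jaffle-data/parquet/raw_orders.parquet"))
# gs://static.getml.com/datasets/jaffle_shop/raw_orders.parquet
```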
39 changes: 39 additions & 0 deletions integration/jaffle-shop-data/convert_jaffle_csv_to_parquet.py
@@ -0,0 +1,39 @@
from pathlib import Path

import pandas as pd

NAMES: list[str] = [
    "raw_customers",
    "raw_items",
    "raw_orders",
    "raw_products",
    "raw_stores",
    "raw_supplies",
    "raw_tweets",
]

JAFFLE_CSV_DATA_PATH = Path("jaffle-data")

if not JAFFLE_CSV_DATA_PATH.exists():
    raise FileNotFoundError(
        f"Jaffle CSV data path {JAFFLE_CSV_DATA_PATH} does not exist."
        " Please run `jafgen` to generate CSVs."
> **Copilot AI** commented (Dec 5, 2025):
>
> The error message references `jafgen` to generate CSVs, but the actual command shown in the documentation (GENERATE_JAFFLE_SHOP_PARQUET.md) is `pipx run jafgen 6`. The error message should provide the complete command to help users resolve the issue more easily.
>
> Suggested change:
>
> ```diff
> -        " Please run `jafgen` to generate CSVs."
> +        " Please run `pipx run jafgen 6` to generate CSVs."
> ```
    )

JAFFLE_PARQUET_DATA_PATH = JAFFLE_CSV_DATA_PATH / "parquet"
Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True)
> **Copilot AI** commented (Dec 5, 2025):
>
> `Path.mkdir()` is called on the class with the path instance passed as the first argument. This does work (it is an unbound-method call with the instance as `self`), but it is unidiomatic; call the method on the instance instead, and pass `parents=True` so parent directories are also created if needed.
>
> Suggested change:
>
> ```diff
> -Path.mkdir(JAFFLE_PARQUET_DATA_PATH, exist_ok=True)
> +JAFFLE_PARQUET_DATA_PATH.mkdir(parents=True, exist_ok=True)
> ```


for name in NAMES:
    csv_filepath = JAFFLE_CSV_DATA_PATH / f"{name}.csv"
    parquet_filepath = JAFFLE_PARQUET_DATA_PATH / f"{name}.parquet"
    print(f"Loading {csv_filepath}...")

    # 1. Read CSV into memory
    df: pd.DataFrame = pd.read_csv(csv_filepath)

    # 2. Write DataFrame to Parquet
    # 'index=False' prevents pandas from adding an extra index column
    df.to_parquet(parquet_filepath, index=False)

    print(f"Converted {name} to parquet format at {parquet_filepath}.")
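A lightweight way to sanity-check the files the script above produced, without loading them (an illustrative sketch, relying only on the Parquet format's magic-number convention: a Parquet file begins and ends with the 4-byte marker `PAR1`):

```python
from pathlib import Path


def looks_like_parquet(path: Path) -> bool:
    """Cheap structural check: Parquet files start and end with b"PAR1"."""
    data = path.read_bytes()
    # Smallest structurally valid file: magic + footer length + magic = 12 bytes.
    return len(data) >= 12 and data[:4] == b"PAR1" and data[-4:] == b"PAR1"


parquet_dir = Path("jaffle-data/parquet")
for f in sorted(parquet_dir.glob("*.parquet")):
    status = "ok" if looks_like_parquet(f) else "not a Parquet file?"
    print(f"{f.name}: {status}")
```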
137 changes: 137 additions & 0 deletions integration/pyproject.toml
@@ -0,0 +1,137 @@
[project]
name = "getml-featurestore-integrations"
version = "0.1.0"
description = "Integrations and Data Preparation for getML Feature Stores"
authors = [
    { name = "Code17 GmbH", email = "hello@code17.io" },
    { name = "getML", email = "hello@getml.com" },
]
maintainers = [
    { name = "Code17 GmbH", email = "hello@code17.io" },
    { name = "getML", email = "hello@getml.com" },
]
license = { text = "Proprietary" }
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Operating System :: OS Independent",
    "Private :: Do Not Upload",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
readme = "README.md"
> **Copilot AI** commented (Dec 5, 2025):
>
> The pyproject.toml references `readme = "README.md"`, but the README.md file does not exist in the integration directory. This will cause build failures when attempting to package the project. Either create the README.md file or remove this line from the configuration.
>
> Suggested change:
>
> ```diff
> -readme = "README.md"
> ```
requires-python = ">=3.12"

dependencies = [
    "fastparquet>=2024.11.0",
    "httpx>=0.27.0",
    "ipykernel>=7.1.0",
    "pandas>=2.3.3",
    "pyarrow>=18.0.0",
    "pydantic>=2.12.5",
    "pydantic-settings>=2.12.0",
    "snowflake-connector-python>=3.17.3",
    "snowflake-snowpark-python>=1.42.0",
]

[dependency-groups]
dev = [
    "ruff~=0.12.2",
    "basedpyright~=1.28.4",
    "pytest~=8.0.0",
    "pytest-cov>=6.2.1",
    "pytest-dependency>=0.6.0",
]

[tool.uv]
package = false

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
markers = [
    "integration: marks tests as integration tests (require Snowflake credentials)",
]
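For reference, a hypothetical test using the `integration` marker declared above (illustrative only; the real tests live under `tests/` and require Snowflake credentials):

```python
import pytest


@pytest.mark.integration
def test_snowflake_connection():
    # Would open a real Snowflake session here; requires credentials.
    assert True
```

Tests marked this way can then be deselected locally with `pytest -m "not integration"`.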

[project.urls]
"Homepage" = "https://github.com/getml/getml-demo"
"Bug Tracker" = "https://github.com/getml/getml-demo/issues"
"getML" = "https://getml.com"
"Code17 GmbH" = "https://www.code17.io/"

[tool.pyright]
venvPath = "."
venv = ".venv"
reportMissingTypeStubs = false
reportImplicitStringConcatenation = false

[[tool.pyright.executionEnvironments]]
root = "tests"
extraPaths = ["."]
reportUnusedParameter = false

[build-system]
requires = ["uv_build>=0.7.21,<0.8.0"]
build-backend = "uv_build"

[tool.ruff]
line-length = 88
target-version = "py312"

[tool.ruff.format]
preview = false
quote-style = "double"
line-ending = "auto"
docstring-code-format = true

[tool.ruff.lint]
select = ["ALL"]
ignore = [
    # Allow for string literals in exceptions
    "EM",
    # Allow missing copyright notice at top of files
    "CPY001",
    # Allow missing docstrings in public modules
    "D100",
    # Allow missing docstrings in public classes
    "D101",
    # Allow missing docstrings in public packages
    "D104",
    # Allow docstrings without blank line before class docstring
    "D203",
    # Allow multi-line docstring summary to start at second line
    "D213",
    # Allow first-party imports outside type-checking blocks
    "TC001",
    # Allow third-party imports outside type-checking blocks
    "TC002",
    # Allow standard library imports outside type-checking blocks
    "TC003",
    # Allow TODO comments
> **Copilot AI** commented (Dec 5, 2025):
>
> Trailing whitespace found at the end of this comment line. Remove the extra spaces.
>
> Suggested change (whitespace-only):
>
> ```diff
> -# Allow TODO comments 
> +# Allow TODO comments
> ```
    "FIX002",
    # Allow TODO comments without author
    "TD002",
    # Allow TODO comments without link to issue
    "TD003",
    # Allow specifying long messages outside the exception class
    "TRY003",
    # Conflicts with formatter - trailing commas are handled by ruff format
    "COM812",
]

fixable = ["ALL"]

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.lint.per-file-ignores]
# S101: Allow for use of the assert keyword
# PLR2004: Allow "magic value" used in comparison
"test_*.py" = ["S101", "PLR2004"]