# Define Logging and Handler

In [0]:
from datetime import datetime
from zoneinfo import ZoneInfo
import logging

# Root-logger setup: INFO and above, rendered as "<timestamp> <LEVEL> <message>".
# The %(...)s placeholders are LogRecord attributes, see
# https://docs.python.org/3/library/logging.html#logrecord-attributes
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
_LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Single module-wide reference to the root logger (handlers are attached to it later)
root_logger = logging.getLogger()

# Define a custom ListHandler. Handler that appends each record to the target list.
class ListHandler(logging.Handler):
    """Logging handler that appends every handled record to a caller-supplied list.

    Each record is stored as a dict with three keys:

    * ``run_ts``  - timezone-aware UTC ``datetime`` built from ``record.created``
    * ``level``   - the record's level name (``record.levelname``)
    * ``message`` - the formatted message (``record.getMessage()``)
    """

    def __init__(self, target_list):
        """Keep a reference to *target_list*; entries are appended to it in place."""
        super().__init__()  # under the hood invokes logging.Handler.__init__
        self.target = target_list

    def emit(self, record):
        """Append a structured entry for *record* (a ``logging.LogRecord``).

        Any failure while building or storing the entry - for example a log
        call whose arguments do not match its format string - is passed to
        ``handleError`` instead of propagating into the code that issued the
        log call, which is the convention the standard-library handlers follow.
        """
        try:
            utc_dt = datetime.fromtimestamp(
                record.created,  # POSIX timestamp stored on the record
                tz=ZoneInfo("UTC"),  # keep in UTC
            )

            # Append structured log entry
            self.target.append({
                "run_ts": utc_dt,
                "level": record.levelname,
                "message": record.getMessage(),
            })
        except Exception:
            # A logging sink must never break the caller that logged.
            self.handleError(record)

2025-07-03 18:40:05 INFO Error while sending or receiving.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
2025-07-03 18:40:05 INFO Closing down clientserver connection
2025-07-03 18:40:05 INFO Exception while sending command.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/

# Define Functions

In [0]:
import os

def get_shopify_secret(
    shop: str = "SHOPIFY_US",
    api_version: str = "2023-04",
    scope: str = "azure_key_vault",
    key: str = "SHOPIFYUS-PW",
):
    """
    Collect the connection details for one Shopify shop.

    Args:
        shop: Name of the environment variable that holds the shop name.
        api_version: Shopify API version string; returned unchanged.
        scope: Secret scope passed to ``dbutils.secrets.get`` (per the original
            comments the password is kept in Azure Key Vault).
        key: Secret key passed to ``dbutils.secrets.get``.

    Returns:
        Tuple ``(shop_name, shop_url, api_version, private_app_password)`` where
        ``shop_url`` is ``"<shop_name>.myshopify.com"``.

    Raises:
        ValueError: if the environment variable named by *shop* is unset or
            empty. Without this check the URL would silently become
            ``"None.myshopify.com"``.
    """
    shop_name = os.getenv(shop)
    if not shop_name:
        raise ValueError(
            f"Environment variable {shop!r} is not set or empty; "
            "cannot build the Shopify shop URL"
        )

    shop_url = f"{shop_name}.myshopify.com"

    # `dbutils` is not defined in this file; presumably the notebook runtime provides it.
    private_app_password = dbutils.secrets.get(scope=scope, key=key)

    return shop_name, shop_url, api_version, private_app_password

import shopify
import json
import urllib.error

def fetch_all_pages(entity: str, query: str, session: shopify.Session, page_size: int = 10) -> list:
    """
    Run a paginated Shopify GraphQL query and return the nodes of every page fetched.

    The session is activated, then the query is executed repeatedly with the
    variables ``first`` (= *page_size*) and, from the second page on, ``after``
    (= the previous page's ``pageInfo.endCursor``) until
    ``pageInfo.hasNextPage`` is false.

    Paging stops early, keeping whatever was collected so far, when:

    * the request raises ``urllib.error.HTTPError`` or the response is not valid JSON,
    * the response contains an ``errors`` key,
    * the response has no ``data.<entity>`` object.

    An early stop is logged as a warning, so a partial result is never reported
    as "all pages received". The Shopify session is cleared in every case.

    Args:
        entity: Top-level field of the query; used as the key under ``data``.
        query: GraphQL query text accepting ``$first`` and ``$after``.
        session: Shopify session to activate for the duration of the call.
        page_size: Value sent as ``first`` on every request.

    Returns:
        List with the ``node`` of every collected edge, in the order received.
    """
    # Log the start
    logging.info("Starting Shopify Session for '%s'", entity)

    # Activate the shopify session
    shopify.ShopifyResource.activate_session(session)

    all_edges = []
    cursor = None
    completed = False  # becomes True only when the last page was reached

    try:
        while True:
            variables = {"first": page_size}
            if cursor:
                variables["after"] = cursor

            try:
                # Execute the GraphQL query for the current page
                raw = shopify.GraphQL().execute(query, variables=variables)
                result = json.loads(raw)
            except (urllib.error.HTTPError, ValueError) as e:
                logging.error("Error fetching/parsing %s page: %s", entity, e)
                break

            if "errors" in result:
                msg = result["errors"][0].get("message", "(no message)")
                logging.error("GraphQL error for %s: %s", entity, msg)
                break

            # `data` may be absent or an explicit JSON null; treat both as "no data"
            data = (result.get("data") or {}).get(entity)
            if data is None:
                logging.error("No data.%s in response", entity)
                break

            has_next_page = data["pageInfo"]["hasNextPage"]
            items = data["edges"]

            all_edges.extend(items)
            logging.info("Received %s records for %s", len(items), entity)

            if not has_next_page:
                completed = True
                break
            cursor = data["pageInfo"]["endCursor"]

        if completed:
            logging.info("Received all the pages for %s", entity)
        else:
            logging.warning(
                "Stopped paging %s early after %d records; result is incomplete",
                entity,
                len(all_edges),
            )

    finally:
        # Deactivate the shopify session
        shopify.ShopifyResource.clear_session()
        logging.info("Cleared Shopify Session for %s", entity)

    return [edge["node"] for edge in all_edges]

def write_to_lakehouse(
    data: list = None,
    partition_number: int = 1,
    medal: str = "bronze",
    subfolder_path: str = "shopify",
    entity: str = "",
    file_format: str = "delta",
    write_mode: str = "overwrite",
    merge_schema: bool = True,
):
    """
    Write a list of JSON-serialisable records to ADLS and register them as a table.

    The records are serialised with ``json.dumps``, read back through
    ``spark.read.json`` and saved with ``saveAsTable`` under the name
    ``<first segment of subfolder_path>.<medal>.<table>``, where ``<table>`` is
    *entity* followed by the remaining path segments joined with ``_``
    (e.g. ``shopify/US`` + ``products`` -> ``shopify.<medal>.products_US``).
    The files go to
    ``abfss://<medal>@qydatalake.dfs.core.windows.net/<subfolder_path>/<entity>/``.

    ``subfolder_path`` is stripped of leading/trailing ``/`` *before* both the
    storage path and the table name are derived, so the two always agree and
    the path never contains an accidental ``//``.

    Args:
        data: Records to write; ``None`` or an empty list skips the write.
        partition_number: Number of partitions to coalesce to before writing.
        medal: Container name in the path and schema part of the table name.
        subfolder_path: Folder below the container; first segment is the catalog.
        entity: Entity name; last folder of the path and base of the table name.
        file_format: Format passed to the DataFrame writer.
        write_mode: Save mode passed to the DataFrame writer.
        merge_schema: Value of the writer's ``mergeSchema`` option.

    Returns:
        None.
    """
    # Check whether the data is empty
    if not data:
        logging.info("No data for %s. Skip write.", entity)
        return

    # Normalise once so the storage path and the table name are built from the same value
    subfolder_path = subfolder_path.strip("/")

    # Define the storage path
    path = f"abfss://{medal}@qydatalake.dfs.core.windows.net/{subfolder_path}/{entity}/"

    # Define the catalog name and table name
    catalog_name, *extra_parts = subfolder_path.split("/")
    if extra_parts:
        table_name = f"{entity}_{'_'.join(extra_parts)}"
    else:
        table_name = entity

    # `sc` and `spark` are not defined in this file; presumably the notebook runtime provides them.
    # Build an RDD of JSON strings and let Spark infer the schema from it
    json_rdd = sc.parallelize([json.dumps(record) for record in data])
    df = spark.read.json(json_rdd)

    # Write into the lakehouse
    (
        df
        .coalesce(partition_number)  # original author's note: data is small (< 250 MB), one file avoids metadata scan overhead
        .write
        .format(file_format)
        .mode(write_mode)
        .option("mergeSchema", merge_schema)  # allows new columns to be added when the table already exists
        .option("path", path)  # writes the files into ADLS
        .saveAsTable(f"{catalog_name}.{medal}.{table_name}")  # registers (or refreshes) the table
    )

    logging.info("Wrote %d records of %s to %s", len(data), entity, path)

def fetch_entity_data(
    entity: str, query: str, shop_name: str, shop_url: str, api_version: str, password: str
):
    """
    Worker: fetch every page of one entity for one shop and capture the logs.

    Steps:
        1) build a Shopify session from `shop_url`, `api_version`, `password`;
        2) attach a ListHandler (INFO level) to the root logger;
        3) call fetch_all_pages; any exception is logged and yields no records;
        4) detach the handler and return (shop_name, entity, records, logs).

    `shop_name` is passed through untouched so the caller can tell which shop
    the result belongs to.
    """
    shop_session = shopify.Session(shop_url, api_version, password)

    # Everything logged through the root logger while the handler is attached lands in `captured`
    captured = []
    capture_handler = ListHandler(captured)
    capture_handler.setLevel(logging.INFO)
    root_logger.addHandler(capture_handler)

    try:
        # A list of dicts, one per node returned by the query
        fetched = fetch_all_pages(entity, query, shop_session)
    except Exception:
        # Logged while the handler is still attached, so the failure is part of `captured`
        logging.exception("Unexpected error fetching %s", entity)
        fetched = []
    finally:
        # Detach no matter what happened above
        root_logger.removeHandler(capture_handler)

    return shop_name, entity, fetched, captured

2025-07-03 18:40:08 INFO Received command c on object id p1
2025-07-03 18:40:08 INFO Received command c on object id p0
2025-07-03 18:40:09 INFO Received command c on object id p0
2025-07-03 18:40:10 INFO Closing down clientserver connection
2025-07-03 18:40:10 INFO Closing down clientserver connection
2025-07-03 18:40:10 INFO Closing down clientserver connection
2025-07-03 18:40:10 INFO Closing down clientserver connection


In [0]:
from pyspark.sql.types import TimestampType, StructField, StructType, StringType
import pyspark.sql.functions as F

def log_progress(
    log_records: list = None,
    medal: str = "bronze",
    subfolder_path: str = "shopify",
    entity: str = "",
):
    """
    Append captured log entries to a Delta location in the data lake.

    Target:
    abfss://{medal}@qydatalake.dfs.core.windows.net/{subfolder_path}/logs/{entity}_log

    Each entry is expected to provide `run_ts`, `level` and `message`, matching
    the schema declared below. Nothing is written when `log_records` is None
    or empty.
    """
    # Nothing captured: leave the log location untouched
    if not log_records:
        return

    # run_ts is a timestamp, the remaining columns are strings; all non-nullable
    string_fields = [
        StructField(column, StringType(), nullable=False)
        for column in ("level", "message")
    ]
    log_schema = StructType(
        [StructField("run_ts", TimestampType(), nullable=False)] + string_fields
    )

    target = f"abfss://{medal}@qydatalake.dfs.core.windows.net/{subfolder_path}/logs/{entity}_log"

    # coalesce(1) so each call produces a single output file
    single_file_df = spark.createDataFrame(log_records, log_schema).coalesce(1)

    # NOTE: written by path only; the log is not registered as a catalog table
    writer = (
        single_file_df.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", True)
    )
    writer.save(target)

    logging.info("Logged %d entries for %s", len(log_records), entity)

2025-07-03 18:40:10 INFO Received command c on object id p1
2025-07-03 18:40:10 INFO Error while sending or receiving.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
2025-07-03 18:40:10 INFO Closing down clientserver connection
2025-07-03 18:40:10 INFO Received command c on object id p0
2025-07-03 18:40:10 INFO Exception while sending command.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, i

# Compose GraphQL Queries

In [0]:
# GraphQL notation: a leading `$` marks a variable, a trailing `!` marks a non-nullable type.
#
# Paginated product query. `$first` / `$after` are the variables that
# fetch_all_pages sends as "first" / "after", and the top-level field name
# (`products`) is the key fetch_all_pages looks up under `data`, so it must
# match the key used for this query in `entity_queries`.
# The nested connections (variants, metafields, collections) ask for the first
# 250 entries only and request no pageInfo, so anything beyond 250 per parent
# is not retrieved by this query.
product_query = """
query getProducts($first: Int!, $after: String) {
  products(first: $first, after: $after) {
    edges {
      node {
        id
        title
        bodyHtml
        vendor
        productType
        createdAt
        handle
        updatedAt
        publishedAt
        templateSuffix
        tags
        variants(first: 250) {
          edges {
            node {
              id
              sku
              title
              price
              compareAtPrice
              image {
                url
              }
              inventoryQuantity
              inventoryItem {
                id
              }
              metafields(first: 250) {
                edges {
                  node {
                    id
                    namespace
                    key
                    value
                    ownerType
                  }
                }
              }
            }
          }
        }
        metafields(first: 250) {
          edges {
            node {
              id
              namespace
              key
              value
              ownerType
            }
          }
        }
        collections(first: 250) {
          edges {
            node {
              id
              handle
              title
              updatedAt
            }
          }
        }
      }
    }
    pageInfo {
      hasNextPage
      hasPreviousPage
      startCursor
      endCursor
    }
  }
}
"""

# Paginated inventory query: one node per inventory item with its id, sku and,
# for each inventory level, the location (id, name) and the "available" quantity.
# Same `$first` / `$after` paging variables as the product query; the top-level
# field name (`inventoryItems`) is the key looked up under `data`.
# inventoryLevels is limited to the first 250 entries per item and requests no
# pageInfo, so levels beyond 250 are not retrieved by this query.
inventoryItems_query = """
query inventoryItems($first: Int!, $after: String)
{
  inventoryItems(first: $first, after: $after) {
    edges {
      node {
        id
        sku
        inventoryLevels(first: 250) {
          edges {
            node {
              location {
                id
                name
              }
              quantities(names: "available") {
                name
                quantity
              }
            }
          }
        }
      }
    }
    pageInfo {
      hasNextPage
      hasPreviousPage
      startCursor
      endCursor
    }
  }
}
"""

# 1. Fetch entities from Shopify by GraphQL
# 2. Write the output to Lakehouse
# 3. Log progress

In [0]:
from concurrent.futures import ProcessPoolExecutor, as_completed

# === MAIN PIPELINE ===
if __name__ == "__main__":
    # Shops to load, identified by their channel suffix (US, AU, etc.)
    channels = ["US", "AU"]

    # Resolve name / URL / API version / password for every shop up front
    config_keys = ("name", "url", "version", "password")
    shop_configs = []
    for channel in channels:
        secret = get_shopify_secret(
            shop=f"SHOPIFY_{channel}",
            api_version="2023-04",
            scope="azure_key_vault",
            key=f"SHOPIFY{channel}-PW",
        )
        shop_configs.append(dict(zip(config_keys, secret)))

    # Entity name -> GraphQL query to run for it
    entity_queries = {
        "products": product_query,
        "inventoryItems": inventoryItems_query,
    }

    # One job per (shop, entity) pair. The tuple layout matches the positional
    # parameters of fetch_entity_data, which builds its own Shopify session per job.
    jobs = []
    for shop_config in shop_configs:
        for entity, query in entity_queries.items():
            jobs.append(
                (
                    entity,
                    query,
                    shop_config["name"],
                    shop_config["url"],
                    shop_config["version"],
                    shop_config["password"],
                )
            )

    # Parallel GraphQL fetch: one worker process per job
    worker_count = len(entity_queries) * len(shop_configs)
    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = [pool.submit(fetch_entity_data, *job) for job in jobs]

        # Handle results in completion order; writing and logging happen in this process
        for finished in as_completed(pending):
            channel, entity, data, logs = finished.result()
            target_folder = f"shopify/{channel}"

            # Write to Lakehouse
            write_to_lakehouse(data=data, partition_number=1, medal="bronze", subfolder_path=target_folder, entity=entity)

            # Log progress
            log_progress(log_records=logs, medal="bronze", subfolder_path=target_folder, entity=entity)

# --- End of shopify_loader.py ---

2025-07-03 18:40:11 INFO Received command c on object id p1
2025-07-03 18:40:11 INFO Error while sending or receiving.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
2025-07-03 18:40:11 INFO Closing down clientserver connection
2025-07-03 18:40:11 INFO Exception while sending command.
Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 528, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(comman