In [0]:
import json
import re
import time
import urllib.parse
from datetime import datetime, timezone
from time import sleep

import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from requests_aws4auth import AWS4Auth

# Notebook parameters: the reporting window is supplied through two Databricks
# text widgets, both defaulting to the empty string.
dbutils.widgets.text('start', '')
dbutils.widgets.text('end', '')

# Read back whatever the caller/job put into the widgets.
start = dbutils.widgets.get('start')
end = dbutils.widgets.get('end')


# IAM user key pair and role ARN used to sign the STS AssumeRole call below.
# NOTE(review): these are long-lived secrets committed in plain text; they
# should be loaded from a secret store (e.g. a Databricks secret scope) and
# the values shown here rotated.
A_KEY = 'AK...........2GE'
A_SECRET = 'Kz54x...................Bx'
ARN = 'arn:aws:iam::46.....117:role/ebzsellingapiRole'


# Login-with-Amazon client id / client secret / refresh token, posted to the
# LWA token endpoint to obtain an access token.
# NOTE(review): same concern as above — hard-coded secrets.
C_KEY = 'amzn1.application-oa2-client.c895...............9f'
C_SECRET_KEY = 'amzn1.oa2-cs.v1.cbda929..........................494854e2614920a781a'
C_REFRESH_TOKEN = 'Atzr|IwEBID..............................lBJH88tgS9p'


# Marketplace identifiers, annotated with their country codes.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
MARKETPLACE_IDS = [
                'A1PA6795UKMFR9',  # DE
                'A1F83G8C2ARO7P',  # UK
                'A13V1IB3VIYZZH',  # FR
                'A1RKKUPIHCS9HS',  # ES
                'APJ6JRA9NG5V4'    # IT
                ]

In [0]:
# --- Authentication Functions ---

def get_access_token():
    """Exchange the stored LWA refresh token for a short-lived access token.

    Posts a ``refresh_token`` grant to the Login-with-Amazon token endpoint
    using the module-level ``C_KEY``, ``C_SECRET_KEY`` and ``C_REFRESH_TOKEN``.

    Returns:
        str: the ``access_token`` field of the JSON response.

    Raises:
        RuntimeError: if the endpoint answers with a non-2xx status or the
            response carries no ``access_token``.
    """
    response = requests.post(
        "https://api.amazon.com/auth/o2/token",
        data={
            "grant_type": "refresh_token",
            "refresh_token": C_REFRESH_TOKEN,
            "client_id": C_KEY,
            "client_secret": C_SECRET_KEY,
        },
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=30,
    )

    # Surface the endpoint's own error body (e.g. "invalid_grant") instead of
    # failing later with an opaque KeyError on 'access_token'.
    if not response.ok:
        raise RuntimeError(
            f"LWA token request failed with HTTP {response.status_code}: {response.text}"
        )

    token = response.json().get('access_token')
    if not token:
        raise RuntimeError("LWA token response did not contain an access_token")
    return token


def assume_role():
    """Assume the configured IAM role via STS and return temporary credentials.

    Sends a SigV4-signed ``AssumeRole`` request (signed with the module-level
    ``A_KEY`` / ``A_SECRET``) for the role in ``ARN`` with a 3600-second
    session, then pulls the three credential fields out of the XML reply.

    Returns:
        tuple[str, str, str]: ``(access_key, secret_key, session_token)``.

    Raises:
        RuntimeError: if STS answers with a non-2xx status, or if any of the
            three credential elements is missing from the response body.
    """
    response = requests.get(
        "https://sts.amazonaws.com",
        # Let requests build the query string so the ARN (which contains ':'
        # and '/') is percent-encoded rather than spliced in raw.
        params={
            "Version": "2011-06-15",
            "Action": "AssumeRole",
            "RoleSessionName": "Test",
            "RoleArn": ARN,
            "DurationSeconds": "3600",
        },
        auth=AWS4Auth(A_KEY, A_SECRET, 'us-east-1', 'sts'),
        timeout=30,
    )

    if not response.ok:
        raise RuntimeError(
            f"STS AssumeRole failed with HTTP {response.status_code}: {response.text}"
        )

    def _element_text(tag):
        """Return the text of the first <tag>…</tag> element in the reply."""
        # [^<]+ cannot run past the closing tag, unlike a greedy '.+'.
        found = re.search(f"<{tag}>([^<]+)</{tag}>", response.text)
        if found is None:
            # Deliberately do not echo the body: a 200 reply contains secrets.
            raise RuntimeError(f"STS AssumeRole response is missing <{tag}>")
        return found.group(1)

    return (
        _element_text('AccessKeyId'),
        _element_text('SecretAccessKey'),
        _element_text('SessionToken'),
    )


def get_keys():
    """Collect the LWA access token plus the temporary AWS credentials.

    Returns a flat tuple: the access token first, followed by every value
    produced by ``assume_role()`` in order.
    """
    lwa_token = get_access_token()
    aws_credentials = tuple(assume_role())
    return (lwa_token,) + aws_credentials


# Fetch the LWA token and temporary AWS credentials once for the cells below.
access_token, access_key, secret_key, session_token = get_keys()
# Confirm success without echoing the bearer token: notebook outputs are
# persisted with the notebook and visible to anyone who can open it.
print('Got the access token (value withheld from notebook output)')

In [0]:
# Connect to Azure SQL Database
# NOTE(review): the JDBC URL carries the database user and password inline;
# load them from a secret store instead of keeping them in the notebook.
connection_string = f"jdbc:sqlserver://ebzreporting.database.windows.net:1433;database=dwh_reporting_prod;user=<USERNAME>@ebzreporting;password=<PASSWORD>;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"

# Accepted widget formats: 'YYYY-MM-DD' or 'YYYY-MM-DD[ T]HH:MM:SS' (ASCII digits only).
_WINDOW_BOUND_RE = re.compile(r"[0-9]{4}-[0-9]{2}-[0-9]{2}(?:[ T][0-9]{2}:[0-9]{2}:[0-9]{2})?")


def _require_iso_date(value, widget_name):
    """Return ``value`` unchanged after verifying it is a plain ISO date/datetime.

    The widget values are interpolated into a SQL string below, so anything
    that is not strictly a date literal is rejected up front.

    Raises:
        ValueError: if ``value`` is empty, malformed, or not a real calendar date.
    """
    if not isinstance(value, str) or not _WINDOW_BOUND_RE.fullmatch(value):
        raise ValueError(
            f"Widget '{widget_name}' must be an ISO date such as 2025-11-17, got {value!r}"
        )
    # The regex checks the shape only; this rejects values like 2025-13-40.
    datetime.fromisoformat(value)
    return value


window_start = _require_iso_date(start, 'start')
window_end = _require_iso_date(end, 'end')

# Spark's JDBC reader takes a parenthesised sub-select as the "table".
query = f"""
(
    SELECT DISTINCT order_id 
    FROM logistics_ft_orders 
    WHERE purchase_date >= '{window_start}' AND purchase_date <= '{window_end}'
) AS orders_filtered
"""

df_orders = spark.read.jdbc(
    url=connection_string,
    table=query
)

print(f"Found {df_orders.count()} orders between {start} and {end}")

Found 456 orders between 2025-11-17 and 2025-11-17


In [0]:
# Pull the order_id column to the driver as a plain Python list.
order_ids = list(
    order_row.order_id
    for order_row in df_orders.select("order_id").collect()
)

In [0]:
# Selling Partner API endpoint and the AWS region used when signing requests to it.
SP_API_HOST = "sellingpartnerapi-eu.amazon.com"
SP_API_REGION = "eu-west-1"

# Client-side throttle: the pause between calls is the inverse of the
# request rate, i.e. 2 seconds at 0.5 requests per second.
REQUESTS_PER_SECOND = 0.5
DELAY_BETWEEN_REQUESTS = 1 / REQUESTS_PER_SECOND

# Quota handling: number of retries allowed, and seconds to wait before each one.
MAX_RETRIES = 3
RETRY_DELAY = 60

In [0]:
def get_order_items_with_pagination(order_id, access_token, access_key, secret_key, session_token, retry_count=0, request_timeout=30):
    """
    Fetch all order items for one order, following pagination.

    Pages are requested until the API stops returning a ``NextToken``, pausing
    ``DELAY_BETWEEN_REQUESTS`` seconds between pages. A ``QuotaExceeded``
    answer — either as an HTTP error or inside a 200 body — waits
    ``RETRY_DELAY`` seconds and retries the *same* page, so pages already
    collected are kept. At most ``MAX_RETRIES`` quota retries are spent per order.

    Args:
        order_id: Amazon order id to look up.
        access_token: LWA access token, sent as ``x-amz-access-token``.
        access_key, secret_key, session_token: temporary AWS credentials used
            to SigV4-sign each request.
        retry_count: quota retries already consumed (kept for backward
            compatibility with callers that passed it).
        request_timeout: per-request timeout in seconds.

    Returns:
        dict: always contains ``orderId``, ``status`` and a UTC ISO ``timestamp``.
        On success ``status`` is ``"success"`` and ``data.payload`` holds
        ``OrderItems`` (all pages concatenated) and ``TotalPages``. On failure
        ``status`` is ``"error"`` and ``error`` is a string describing it.
    """
    url = f"https://{SP_API_HOST}/orders/v0/orders/{order_id}/orderItems"
    headers = {
        'x-amz-access-token': access_token,
        'Content-Type': 'application/json'
    }
    # The signer depends only on the credentials, so build it once per order.
    auth = AWS4Auth(
        access_key,
        secret_key,
        SP_API_REGION,
        'execute-api',
        session_token=session_token
    )

    retries_used = retry_count

    def _failure(message):
        """Build the error-shaped result for this order."""
        return {
            "orderId": order_id,
            "status": "error",
            "error": message,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }

    def _pause_for_quota():
        """Sleep before a quota retry; return False once the retry budget is spent."""
        nonlocal retries_used
        if retries_used >= MAX_RETRIES:
            return False
        print(f"  ⚠️ Quota exceeded for order {order_id}. Waiting {RETRY_DELAY} seconds before retry {retries_used + 1}/{MAX_RETRIES}...")
        sleep(RETRY_DELAY)
        retries_used += 1
        return True

    all_items = []
    next_token = None
    page = 1

    while True:
        print(f"  Fetching order {order_id}, page {page}")

        # Pass NextToken via `params` so it is percent-encoded; raw tokens can
        # contain '+', '/' and '=' which are not safe in a query string.
        params = {'NextToken': next_token} if next_token else None

        try:
            response = requests.get(url, headers=headers, auth=auth, params=params, timeout=request_timeout)
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as exc:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException subclass.
            failed_response = getattr(exc, 'response', None)
            error_detail = failed_response.text if failed_response is not None else str(exc)
            if failed_response is not None and 'QuotaExceeded' in error_detail and _pause_for_quota():
                continue  # retry the same page
            return _failure(error_detail)

        # The API may also report errors inside a 200 response body.
        api_errors = result.get('errors') or []
        if any(err.get('code') == 'QuotaExceeded' for err in api_errors):
            if _pause_for_quota():
                continue  # retry the same page
            return _failure(f"QuotaExceeded after {MAX_RETRIES} retries")
        if api_errors:
            # Any other reported error must not be recorded as an empty success.
            return _failure(json.dumps(api_errors))

        payload = result.get('payload') or {}
        all_items.extend(payload.get('OrderItems') or [])

        next_token = payload.get('NextToken')
        if not next_token:
            # No more pages
            break

        page += 1
        # Add delay between paginated requests
        sleep(DELAY_BETWEEN_REQUESTS)

    return {
        "orderId": order_id,
        "status": "success",
        "data": {
            "payload": {
                "OrderItems": all_items,
                "TotalPages": page
            }
        },
        "timestamp": datetime.now(timezone.utc).isoformat()
    }


In [0]:
all_order_items = []
failed_orders = []

# The STS session is requested with DurationSeconds=3600 and LWA access tokens
# are likewise short-lived, while this loop has no upper bound on run time.
# Start from fresh credentials (which also makes the cell safe to re-run
# later) and renew them well before the hour is up.
CREDENTIAL_MAX_AGE_SECONDS = 45 * 60
access_token, access_key, secret_key, session_token = get_keys()
credentials_fetched_at = time.monotonic()

start_time = time.time()
total_orders = len(order_ids)

for position, order_id in enumerate(order_ids, start=1):
    if time.monotonic() - credentials_fetched_at > CREDENTIAL_MAX_AGE_SECONDS:
        access_token, access_key, secret_key, session_token = get_keys()
        credentials_fetched_at = time.monotonic()

    # Progress/ETA line every tenth order, based on the average time so far.
    if position % 10 == 0:
        elapsed = time.time() - start_time
        avg_time_per_order = elapsed / position
        remaining_orders = total_orders - position
        eta_seconds = remaining_orders * avg_time_per_order
        print(f"Progress: {position}/{total_orders} orders | Elapsed: {elapsed/60:.1f}min | ETA: {eta_seconds/60:.1f}min")

    # Use pagination-aware function with rate limiting
    result = get_order_items_with_pagination(
        order_id,
        access_token,
        access_key,
        secret_key,
        session_token
    )

    # Failed lookups are kept in the output too, so the saved file records them.
    all_order_items.append(result)

    if result["status"] == "error":
        failed_orders.append(order_id)
        print(f"  ❌ Error for order {order_id}: {str(result['error'])[:100]}")
    else:
        print(f"  ✅ Successfully fetched order {order_id}")

    # Respect rate limit between orders; nothing to wait for after the last one.
    if position < total_orders:
        sleep(DELAY_BETWEEN_REQUESTS)

print(f"Finished: {total_orders - len(failed_orders)} succeeded, {len(failed_orders)} failed")

In [0]:

storageAccountName = "blobinventory"
containerName = "amazon-order-items"
# NOTE(review): this SAS token is hard-coded, carries broad permissions
# (sp=racwdli) and, per its `se=` field, stays valid until 2029. Load it from
# a secret store and rotate the value shown here.
sas = "sp=racwdli&st=2025-11-14T13:59:29Z&se=2029-04-30T22:14:2....................VE%3D"
config = "fs.azure.sas." + containerName+ "." + storageAccountName + ".blob.core.windows.net"

mount_point = "/mnt/" + containerName

# Unmount only when the mount point actually exists. This keeps the cell
# re-runnable without a bare `except` that would also hide genuine failures
# such as permission problems.
if any(existing.mountPoint == mount_point for existing in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# (Re)mount the blob container with the SAS configuration above.
dbutils.fs.mount(
  source = f"wasbs://{containerName}@{storageAccountName}.blob.core.windows.net",
  mount_point = mount_point,
  extra_configs = {config : sas})


/mnt/amazon-order-items has been unmounted.
Out[44]: True

In [0]:
# Write the collected results (successes and errors alike) as one
# pretty-printed JSON document into the mounted container, replacing any
# earlier file for the same start/end window.
fileName = f'{start}__{end}_order_items'
dbfs_path = f'dbfs:/mnt/{containerName}/orders_items/{fileName}.json'
dbutils.fs.put(
    dbfs_path,
    json.dumps(all_order_items, indent=2),
    overwrite=True,
)

Wrote 629234 bytes.
Out[46]: True