In [0]:
from time import sleep
import pandas as pd
import requests
import re
import json
from datetime import datetime, timezone
import time
from requests_aws4auth import AWS4Auth
import urllib.parse
from aws_requests_auth.aws_auth import AWSRequestsAuth


# Notebook parameters: the processing window is supplied through two
# Databricks text widgets, both defaulting to an empty string.
dbutils.widgets.text('start', '')
dbutils.widgets.text('end', '')

# Raw widget values (strings); they are parsed with "%Y-%m-%d" when the
# date list is built.
start = dbutils.widgets.get('start')
end = dbutils.widgets.get('end')


# AWS key pair used to sign the STS AssumeRole call, and the ARN of the role
# that is assumed.
# NOTE(review): these credentials are hard-coded in the notebook source.
# Consider loading them from a secret store and rotating the exposed values.
A_KEY = 'AKIA........2GE'
A_SECRET = 'Kz54............x'
ARN = 'arn:aws:iam::466....17:role/ebzsellingapiRole'


# Login with Amazon (LWA) client id, client secret and refresh token that are
# posted to the token endpoint to obtain an access token.
# NOTE(review): hard-coded secrets, same concern as above.
C_KEY = 'amzn1.application-oa2-client.c89........e69f'
C_SECRET_KEY = 'amzn1.oa2-cs.v1.cbda92927ac...............a781a'
C_REFRESH_TOKEN = 'Atzr|IwEBIDuBz3QTjlBJH8.......yGrzABTI-Lx2L-HM'


# Marketplace identifiers, one per country as labelled below.
# NOTE(review): not referenced by any code visible in this notebook - confirm
# whether it is still needed.
MARKETPLACE_IDS = [
                'A1PA6795UKMFR9',  # DE
                'A1F83G8C2ARO7P',  # UK
                'A13V1IB3VIYZZH',  # FR
                'A1RKKUPIHCS9HS',  # ES
                'APJ6JRA9NG5V4'    # IT
                ] 

In [0]:
# --- Authentication Functions ---

def get_access_token():
    """Exchange the LWA refresh token for an access token.

    Posts a ``refresh_token`` grant to the Login with Amazon token endpoint
    using the module-level ``C_KEY``, ``C_SECRET_KEY`` and ``C_REFRESH_TOKEN``.

    Returns:
        str: the ``access_token`` field of the JSON response.

    Raises:
        requests.exceptions.HTTPError: the endpoint answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no answer within the timeout.
        KeyError: a successful response did not contain ``access_token``.
    """
    response = requests.post(
        "https://api.amazon.com/auth/o2/token",
        data={
            "grant_type": "refresh_token",
            "refresh_token": C_REFRESH_TOKEN,
            "client_id": C_KEY,
            "client_secret": C_SECRET_KEY,
        },
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=30,
    )
    # Surface authentication failures as an HTTP error instead of an
    # unrelated KeyError on the missing 'access_token' field.
    response.raise_for_status()
    return response.json()['access_token']


def assume_role():
    """Assume the IAM role ``ARN`` via STS and return temporary credentials.

    The request is signed with the module-level ``A_KEY`` / ``A_SECRET`` and
    asks for a one-hour session (``DurationSeconds=3600``).

    Returns:
        tuple: ``(access_key, secret_key, session_token)`` extracted from the
        XML response body.

    Raises:
        requests.exceptions.HTTPError: STS answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no answer within the timeout.
        ValueError: a successful response lacks one of the expected elements.
    """
    response = requests.get(
        f"https://sts.amazonaws.com?Version=2011-06-15&Action=AssumeRole&RoleSessionName=Test&RoleArn={ARN}&DurationSeconds=3600",
        auth=AWS4Auth(A_KEY, A_SECRET, 'us-east-1', 'sts'),
        timeout=30,
    )
    # Fail loudly on a rejected request rather than on the parsing below.
    response.raise_for_status()

    def _extract(tag):
        # Non-greedy match so the capture stops at the first closing tag.
        match = re.search(f'<{tag}>(.+?)</{tag}>', response.text)
        if match is None:
            # The body is deliberately left out of the message: it may hold secrets.
            raise ValueError(f"STS AssumeRole response is missing <{tag}>")
        return match.group(1)

    return (
        _extract('AccessKeyId'),
        _extract('SecretAccessKey'),
        _extract('SessionToken'),
    )


def get_keys():
    """Return (LWA access token, AWS access key, AWS secret key, AWS session token)."""
    # The LWA token is requested first, then the role is assumed.
    lwa_token = get_access_token()
    aws_access_key, aws_secret_key, aws_session_token = assume_role()
    return (lwa_token, aws_access_key, aws_secret_key, aws_session_token)


# Fetch the initial set of credentials used by the API calls below.
access_token, access_key, secret_key, session_token = get_keys()
# Only confirm success: printing the token would store a live secret in the
# notebook output, where anyone with read access to the notebook can see it.
print('Got the access token (value hidden)')



def refresh_credentials_if_needed(last_refresh_time, token_refresh_interval=1800):
    """
    Renew the API credentials once they are older than the allowed interval.

    Args:
        last_refresh_time: time.time() value recorded at the previous refresh
        token_refresh_interval: maximum credential age in seconds (default: 1800 = 30 minutes)

    Returns:
        tuple: (access_token, access_key, secret_key, session_token, refresh_time).
        When no refresh is due the four credential slots are None and
        refresh_time is the unchanged last_refresh_time.
    """
    now = time.time()
    age_seconds = now - last_refresh_time

    # Guard clause: credentials are still fresh, tell the caller to keep them.
    if not (age_seconds >= token_refresh_interval):
        return None, None, None, None, last_refresh_time

    print(f"\n🔄 Refreshing credentials (elapsed: {age_seconds/60:.1f} minutes)...")
    lwa_token, aws_key, aws_secret, aws_session = get_keys()
    print(f"✅ Credentials refreshed successfully")

    # The new refresh time is the instant sampled before the refresh started.
    return lwa_token, aws_key, aws_secret, aws_session, now

In [0]:
# Build the list of days to process: one "YYYY-MM-DD" string per calendar day
# from the 'start' widget value to the 'end' widget value, both included.
start_date = datetime.strptime(start, "%Y-%m-%d")
end_date = datetime.strptime(end, "%Y-%m-%d")

date_list = pd.date_range(start=start_date, end=end_date, freq='D').strftime("%Y-%m-%d")
print(date_list)

In [0]:
# JDBC URL of the Azure SQL database the order ids are read from.
# The <...> placeholders have to be replaced with real values.
connection_string = (
    "jdbc:sqlserver://ebzreporting.database.windows.net:1433;"
    "database=<databasename>;"
    "user=<username>@ebzreporting;"
    "password=<password>;"
    "encrypt=true;"
    "trustServerCertificate=false;"
    "hostNameInCertificate=*.database.windows.net;"
    "loginTimeout=30;"
)

# SP-API endpoint and the AWS region used when signing requests to it.
SP_API_HOST = "sellingpartnerapi-eu.amazon.com"
SP_API_REGION = "eu-west-1"

# Client-side throttling: 0.5 requests per second, i.e. 2 seconds between calls.
REQUESTS_PER_SECOND = 0.5
DELAY_BETWEEN_REQUESTS = 1.0 / REQUESTS_PER_SECOND

# Quota handling: up to 3 retries, pausing 60 seconds before each one.
MAX_RETRIES = 3
RETRY_DELAY = 60

In [0]:
def get_order_items_with_pagination(order_id, access_token, access_key, secret_key, session_token, retry_count=0):
    """
    Fetch all order items for one order, following NextToken pagination.

    Requests are signed with the temporary AWS credentials and carry the LWA
    access token. A quota error (either an HTTP error whose body mentions
    'QuotaExceeded' or an 'errors' entry with that code) is retried after
    RETRY_DELAY seconds, at most MAX_RETRIES times per call. The retry repeats
    only the page that failed, so items already collected are kept.

    Args:
        order_id: order identifier inserted into the request path
        access_token: LWA access token sent as 'x-amz-access-token'
        access_key, secret_key, session_token: temporary AWS credentials
        retry_count: number of quota retries already used (default: 0)

    Returns:
        dict: on success {"orderId", "status": "success", "data": {"payload":
        {"OrderItems", "TotalPages"}}, "timestamp"}; on failure {"orderId",
        "status": "error", "error", "timestamp"}. Request failures are
        reported through the returned dict rather than raised.
    """
    def _error_result(message):
        # Same error record shape for every failure path.
        return {
            "orderId": order_id,
            "status": "error",
            "error": message,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }

    base_url = f"https://{SP_API_HOST}/orders/v0/orders/{order_id}/orderItems"
    headers = {
        'x-amz-access-token': access_token,
        'Content-Type': 'application/json'
    }
    auth = AWS4Auth(
        access_key,
        secret_key,
        SP_API_REGION,
        'execute-api',
        session_token=session_token
    )

    all_items = []
    next_token = None
    page = 1

    while True:
        if next_token is None:
            url = base_url
        else:
            # Percent-encode the token: characters such as '+', '/' or '='
            # would otherwise corrupt the query string.
            url = f"{base_url}?NextToken={urllib.parse.quote(next_token, safe='')}"

        quota_exceeded = False
        try:
            response = requests.get(url, headers=headers, auth=auth, timeout=60)
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            failed_response = getattr(e, 'response', None)
            error_detail = failed_response.text if failed_response is not None else str(e)
            is_quota_error = failed_response is not None and 'QuotaExceeded' in error_detail
            if not is_quota_error or retry_count >= MAX_RETRIES:
                return _error_result(error_detail)
            quota_exceeded = True
        else:
            # A quota error can also be reported inside a successful response.
            # NOTE(review): other error codes in a 2xx body are still ignored,
            # as before - confirm whether they should fail the order.
            reported_errors = result.get('errors') or []
            if any(err.get('code') == 'QuotaExceeded' for err in reported_errors):
                if retry_count >= MAX_RETRIES:
                    return _error_result(f"QuotaExceeded after {MAX_RETRIES} retries")
                quota_exceeded = True

        if quota_exceeded:
            print(f"  ⚠️ Quota exceeded for order {order_id}. Waiting {RETRY_DELAY} seconds before retry {retry_count + 1}/{MAX_RETRIES}...")
            time.sleep(RETRY_DELAY)
            retry_count += 1
            # Retry the same page instead of restarting from page 1.
            continue

        payload = result.get('payload') or {}
        all_items.extend(payload.get('OrderItems') or [])

        next_token = payload.get('NextToken')
        if not next_token:
            # Missing, null or empty token: no more pages. Looping on a null
            # token would request the first page again, forever.
            break

        page += 1
        # Rate limit between paginated requests.
        time.sleep(DELAY_BETWEEN_REQUESTS)

    return {
        "orderId": order_id,
        "status": "success",
        "data": {
            "payload": {
                "OrderItems": all_items,
                "TotalPages": page
            }
        },
        "timestamp": datetime.now(timezone.utc).isoformat()
    }


In [0]:
# Azure Blob Storage container that receives the order-item files.
# NOTE(review): the SAS token is hard-coded here; consider a secret store.
storageAccountName = "blobinventory"
containerName = "amazon-order-items"
sas = "sp=racwdli&st=2025-11-14T13:59:29Z&se=2029-04-30T22:14:29Z&spr=https&sv=2024-11-0............Kp18yVE%3D"
config = f"fs.azure.sas.{containerName}.{storageAccountName}.blob.core.windows.net"
mount_point = f"/mnt/{containerName}"

# Mount the container unless an existing mount already uses this mount point.
if any(existing.mountPoint == mount_point for existing in dbutils.fs.mounts()):
    print(f"{mount_point} already mounted")
else:
    dbutils.fs.mount(
        source=f"wasbs://{containerName}@{storageAccountName}.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={config: sas},
    )
    print(f"Mounted {mount_point}")

/mnt/amazon-order-items already mounted


In [0]:
# Token refresh configuration: renew credentials once they are 30 minutes old.
TOKEN_REFRESH_INTERVAL = 30 * 60
last_token_refresh_time = time.time()
overall_start_time = time.time()

# For every date: read that day's order ids from the orders table, fetch the
# order items of each order, and write one JSON file per date to the mount.
for purchase_date in date_list:
    # purchase_date comes from date_list, i.e. strftime-formatted dates.
    query = f"""
        (
            SELECT DISTINCT order_id 
            FROM logistics_ft_orders 
            WHERE purchase_date = '{purchase_date}'
        ) AS orders_filtered
        """

    df_orders = spark.read.jdbc(
        url=connection_string,
        table=query
    )

    # Collect once and count locally: calling count() and then collect()
    # would run the JDBC query twice and the two results could disagree.
    order_ids = [row.order_id for row in df_orders.select("order_id").collect()]
    order_count = len(order_ids)
    print(f"Found {order_count} orders for {purchase_date}")

    if order_count == 0:
        print(f"⚠️ No orders found for {purchase_date}, skipping...\n")
        continue

    all_order_items = []
    failed_orders = []
    start_time = time.time()

    for idx, order_id in enumerate(order_ids):
        # Check if we need to refresh credentials
        new_access_token, new_access_key, new_secret_key, new_session_token, last_token_refresh_time = refresh_credentials_if_needed(last_token_refresh_time, TOKEN_REFRESH_INTERVAL)

        if new_access_token is not None:
            # A refresh happened: switch to the new credentials.
            access_token = new_access_token
            access_key = new_access_key
            secret_key = new_secret_key
            session_token = new_session_token

        # Progress report every 20 orders.
        if (idx + 1) % 20 == 0:
            elapsed = time.time() - start_time
            time_since_last_refresh = time.time() - last_token_refresh_time
            avg_time_per_order = elapsed / (idx + 1)
            remaining_orders = len(order_ids) - (idx + 1)
            eta_seconds = remaining_orders * avg_time_per_order

            print(f"Progress for {purchase_date}: {idx + 1}/{len(order_ids)} orders | "
                  f"Elapsed: {elapsed/60:.1f}min | ETA: {eta_seconds/60:.1f}min | "
                  f"Token age: {time_since_last_refresh/60:.1f}min")

        # Use pagination-aware function with rate limiting
        result = get_order_items_with_pagination(
            order_id,
            access_token,
            access_key,
            secret_key,
            session_token
        )

        # Tag the result with the date it belongs to; failed orders are kept
        # in the output file as error records.
        result["purchase_date"] = purchase_date
        all_order_items.append(result)

        if result["status"] == "error":
            failed_orders.append(order_id)
            print(f"Error for order {order_id} in {purchase_date}: {result['error'][:100]}")

        # Rate limit between orders.
        time.sleep(DELAY_BETWEEN_REQUESTS)

    # Save data to DBFS (overwrites an existing file for the same date)
    fileName = f'{purchase_date}_order_items'
    dbfs_path = f'dbfs:/mnt/{containerName}/orders_items/{fileName}.json'
    dbutils.fs.put(dbfs_path, json.dumps(all_order_items, indent=2), overwrite=True)

    total_time = time.time() - start_time
    print(f"✅ {fileName} is written. ({len(order_ids)} orders, {total_time/60:.1f}min, {len(failed_orders)} failed)\n")

print(f"\n🎉 All dates processed! Total time: {(time.time() - overall_start_time)/60:.1f} minutes")

Found 489 orders for 2025-11-01
Progress for 2025-11-01: 20/489 orders | Elapsed: 0.7min | ETA: 15.6min | Token age: 0.7min
Progress for 2025-11-01: 40/489 orders | Elapsed: 1.4min | ETA: 15.3min | Token age: 1.4min
Progress for 2025-11-01: 60/489 orders | Elapsed: 2.1min | ETA: 14.8min | Token age: 2.1min
