In [14]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Custom retry logger
def log_request_retry(method, url, retries_left):
    """Print a one-line notice about a request to stdout.

    Args:
        method: HTTP method shown in the notice.
        url: URL shown in the notice.
        retries_left: Value shown after "Retries left:".
    """
    notice = f"[Retry] {method} {url} | Retries left: {retries_left}"
    print(notice)

# Custom HTTPAdapter with verbose retry logging
class VerboseHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that logs a line whenever ``send`` raises.

    The log line is emitted from the ``except`` branch of ``send`` and the
    exception is then re-raised unchanged, so callers see the same errors as
    with a plain ``HTTPAdapter``.

    Attributes:
        max_retries_config: The retry configuration used for the log line.
            Kept for backward compatibility; it mirrors ``self.max_retries``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Read the retry config from the parent instead of from kwargs:
        # HTTPAdapter stores a Retry object on ``self.max_retries`` no matter
        # whether max_retries was passed by keyword, positionally, as an int,
        # or not at all. Reading kwargs directly yielded None/int in those
        # cases and made the logging below crash with AttributeError.
        self.max_retries_config = self.max_retries

    def send(self, request, **kwargs):
        """Send the request; log method, URL and retry budget if it fails.

        Raises:
            Whatever ``HTTPAdapter.send`` raises; the exception is re-raised
            after logging.
        """
        try:
            return super().send(request, **kwargs)
        except Exception:
            # NOTE(review): this is the configured retry budget, not a live
            # countdown -- the adapter's Retry object does not appear to be
            # decremented in place. getattr() keeps the logging from masking
            # the original exception if the config has no ``total``.
            configured_total = getattr(self.max_retries, "total", None)
            log_request_retry(request.method, request.url, configured_total)
            raise

# Shared HTTP session; both URL schemes go through one retrying, logging adapter
session = requests.Session()

# Retry policy: total=5, applied to GET and POST, triggered by the listed
# throttling / server-error status codes, with backoff_factor=1.
retry_strategy = Retry(
    allowed_methods=["GET", "POST"],
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=1,
    total=5,
)

adapter = VerboseHTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

# Scraping function using session
def scrape_guardian_graphql(product_keywords, page_size=12, timeout=30, delay=1, http_session=None):
    """Search the Guardian Indonesia GraphQL endpoint and collect product rows.

    For every keyword the function requests result pages one after another
    until the reported last page is reached, a page has no items, or a request
    fails. A failure only ends the current keyword; rows gathered so far are
    kept and the next keyword is processed.

    Args:
        product_keywords: Iterable of search strings. A single string is
            treated as one keyword.
        page_size: Value sent as the ``pageSize`` GraphQL variable.
        timeout: Timeout passed to ``post`` so a stalled connection cannot
            hang the scrape indefinitely.
        delay: Seconds to sleep between consecutive pages of one keyword.
        http_session: Object with a requests-style ``post`` method. Defaults
            to the module-level ``session``.

    Returns:
        list[dict]: One dict per product with the keys ``Keyword``, ``Name``,
        ``Price``, ``Regular Price``, ``Discount (%)``, ``Promo``,
        ``Stock Status``, ``Product URL`` and ``Image URL``.
    """
    url = "https://guardianindonesia.co.id/graphql"
    site_url = "https://guardianindonesia.co.id"
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Content-Type": "application/json"
    }

    # Only touch the module-level session when no explicit one was supplied.
    http = session if http_session is None else http_session

    # A bare string would otherwise be iterated character by character.
    if isinstance(product_keywords, str):
        product_keywords = [product_keywords]

    results = []

    def build_payload(keyword, page):
        """Build the GraphQL request body for one keyword/page combination."""
        return {
            "operationName": "ProductSearch",
            "query": """
                query ProductSearch($currentPage:Int=1, $inputText:String!, $pageSize:Int=6, $filters:ProductAttributeFilterInput!, $sort:ProductAttributeSortInput) {
                  products(currentPage:$currentPage, pageSize:$pageSize, search:$inputText, filter:$filters, sort:$sort) {
                    items {
                      name
                      url_key
                      stock_status
                      price_range {
                        minimum_price {
                          final_price { value }
                          regular_price { value }
                          discount { percent_off }
                        }
                      }
                      promo
                      small_image { url }
                    }
                    page_info {
                      current_page
                      total_pages
                    }
                  }
                }
            """,
            "variables": {
                "currentPage": page,
                "pageSize": page_size,
                "filters": {},
                "inputText": keyword,
                "sort": {"relevance": "DESC", "stock_status": "DESC"}
            }
        }

    def parse_item(keyword, item):
        """Flatten one GraphQL product item into an output row.

        Price fields are required (a missing one raises and the caller skips
        the item); a null ``discount`` or ``small_image`` is tolerated.
        """
        minimum_price = item["price_range"]["minimum_price"]
        discount_info = minimum_price.get("discount") or {}
        image_info = item.get("small_image") or {}
        url_key = item.get("url_key", "")
        return {
            "Keyword": keyword,
            "Name": item.get("name", ""),
            "Price": minimum_price["final_price"]["value"],
            "Regular Price": minimum_price["regular_price"]["value"],
            "Discount (%)": discount_info.get("percent_off", 0),
            "Promo": item.get("promo") or "-",
            "Stock Status": item.get("stock_status", ""),
            "Product URL": f"{site_url}/{url_key}.html",
            "Image URL": image_info.get("url", "")
        }

    for keyword in product_keywords:
        print(f"🔍 Searching for '{keyword}'...")
        page = 1
        while True:
            payload = build_payload(keyword, page)
            print(f"  → Requesting page {page}")

            try:
                response = http.post(url, headers=headers, json=payload, timeout=timeout)
            except Exception as e:
                print(f"  ✖ Request error on page {page}: {e}")
                break

            if response.status_code != 200:
                print(f"  ✖ Failed to get data (status {response.status_code}) for {keyword} page {page}")
                break

            # A 200 response is not guaranteed to carry JSON (e.g. an HTML
            # error page); do not let that abort the whole scrape.
            try:
                data = response.json()
            except ValueError as e:
                print(f"  ✖ Invalid JSON on page {page} for {keyword}: {e}")
                break

            if not isinstance(data, dict):
                print(f"  ✖ Unexpected response shape on page {page} for {keyword}")
                break

            if data.get("errors"):
                print(f"  ⚠ GraphQL errors on page {page} for {keyword}: {data['errors']}")

            # Keys may be present with a null value (typical when the server
            # reports errors), so `or` is used instead of .get() defaults.
            product_data = (data.get("data") or {}).get("products") or {}
            items = product_data.get("items") or []
            page_info = product_data.get("page_info") or {}
            total_pages = page_info.get("total_pages") or 1

            if not items:
                print(f"  ✱ No items found on page {page}")
                break

            for item in items:
                try:
                    results.append(parse_item(keyword, item))
                except Exception as e:
                    # Best effort: one malformed item must not lose the page.
                    print(f"    ⚠ Error parsing item: {e}")

            if page >= total_pages:
                break

            page += 1
            if delay:
                time.sleep(delay)

    return results


In [15]:
# 🔍 Example product keywords
# Each entry is searched in turn, in list order; the collected rows end up in `data`.
targets = [
    'clear', 'sunsilk', 'lifebuoy', 'tresemme',
    'ponds', 'glow', 'vaseline', 'pepsodent',
    'dove', 'colgate', 'closeup', 'lux',
    'rexona', 'axe', 'head', 'pantene',
    'zinc', 'garnier', 'nivea', 'marina',
    'ciptadent', 'nuvo', 'giv', 'posh',
    'citra', 'rejoice',
]
data = scrape_guardian_graphql(targets)

🔍 Searching for 'clear'...
  → Requesting page 1
  → Requesting page 2
  → Requesting page 3
  → Requesting page 4
  → Requesting page 5
  → Requesting page 6
  → Requesting page 7
  → Requesting page 8
  → Requesting page 9
  → Requesting page 10
  → Requesting page 11
  → Requesting page 12
  → Requesting page 13
  → Requesting page 14
  → Requesting page 15
  → Requesting page 16
  → Requesting page 17
  → Requesting page 18
  → Requesting page 19
🔍 Searching for 'sunsilk'...
  → Requesting page 1
  → Requesting page 2
🔍 Searching for 'lifebuoy'...
  → Requesting page 1
  → Requesting page 2
  → Requesting page 3
  → Requesting page 4
🔍 Searching for 'tresemme'...
  → Requesting page 1
  → Requesting page 2
  → Requesting page 3
🔍 Searching for 'ponds'...
  → Requesting page 1
  → Requesting page 2
  → Requesting page 3
  → Requesting page 4
  → Requesting page 5
  → Requesting page 6
  → Requesting page 7
  → Requesting page 8
  → Requesting page 9
🔍 Searching for 'glow'...
  → Req

In [18]:
# 💾 Save to Excel
df = pd.DataFrame(data)
# Output file is stamped with today's date (two-digit year, month, day).
file_name = f"../../guardian/GUARDIAN_{datetime.now().strftime('%y%m%d')}.xlsx"
# Make sure the target folder exists so the export cannot fail on a missing
# directory after a long scrape.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
df.to_excel(file_name, index=False)
# Report the path that was actually written (the old message named a file
# that this cell never creates).
print(f"✅ Done. Saved {len(df)} rows to '{file_name}'")

✅ Done. Saved to 'guardian_products_graphql.xlsx'
