In [13]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def convert_price(price_str):
    """Turn a price label into an int, e.g. "Rp 12.500" -> 12500.

    Every "Rp" and every "." is removed from the text, surrounding whitespace
    is trimmed, and the remainder is handed to int(); int() raises ValueError
    if anything other than an integer literal is left.
    """
    without_currency = price_str.replace("Rp", "")
    without_dots = without_currency.replace(".", "")
    return int(without_dots.strip())

# Search keywords sent to the product search, in the order they are queried.
targets = [
    "clear shampoo", "sunsilk shampoo", "lifebuoy", "tresemme shampoo",
    "ponds", "glow lovely", "vaseline", "pepsodent",
    "closeup", "lifebuoy sabun mandi", "lux botanicals", "rexona",
    "axe", "molto", "sunlight", "wipol",
    "vixal", "royco", "bango kecap", "sariwangi",
    "buavita", "head shoulders", "pantene shampoo", "zinc",
    "garnier", "nivea", "marina", "ciptadent pasta gigi",
    "nuvo", "giv", "posh men body spray", "soklin",
    "downy pelembut", "garnier sakura", "mama lemon", "supersol",
    "yuri porstex", "masako penyedap rasa ayam", "sedaap kecap", "abc kecap manis",
    "sosro teh", "sosro teh asli", "tong tji", "country choice",
    "citra", "superpel", "rinso", "rejoice",
    "posh",
]


In [14]:
import requests
import pandas as pd
import time
import uuid

# Module-level accumulator: fetch_products() rebinds this name after every call,
# so it starts out as an empty frame that already has the final column layout.
all_products_df = pd.DataFrame(
    columns=[
        "productName",
        "basePrice",
        "finalPrice",
        "discountPercent",
        "plu",
        "location",
    ]
)

def _parse_discount_percent(discount_text):
    """Convert a discount label such as "15%" to a float; a missing or blank label counts as 0.0."""
    cleaned = str(discount_text or "").replace("%", "").strip()
    # Check emptiness AFTER stripping, so a bare "%" or whitespace never reaches float("").
    return float(cleaned) if cleaned else 0.0


def _extract_product_rows(data):
    """Build [productName, basePrice, finalPrice, discountPercent, plu] rows from a decoded search response."""
    payload = data.get("data") or {}
    if (payload.get("additionalData") or {}).get("notFound"):
        return []

    rows = []
    for product in payload.get("content") or []:
        base_price = product.get("price")
        rows.append([
            # `or` also covers a productName that is present but null.
            (product.get("productName") or "Unknown").strip(),
            base_price,
            # A missing/zero finalPrice falls back to the base price.
            product.get("finalPrice") or base_price,
            _parse_discount_percent(product.get("discountText")),
            product.get("plu"),
        ])
    return rows


def fetch_products(keyword, latitude, longitude, store_code, district_id, location, page=0, size=15, timeout=30):
    """Fetch one page of search results and merge them into the global ``all_products_df``.

    Parameters
    ----------
    keyword : search term sent as the ``keyword`` query parameter.
    latitude, longitude, store_code, district_id : forwarded unchanged as the
        ``latitude``, ``longitude``, ``storeCode`` and ``districtId`` query parameters.
    location : label written to the ``location`` column of every row from this call.
    page, size : forwarded as the ``page`` and ``size`` query parameters; only this
        single page is requested.
    timeout : seconds passed to ``requests.get`` so a stalled connection raises
        instead of blocking forever.

    Returns
    -------
    None. The result is the side effect on the module-level ``all_products_df``.

    Raises
    ------
    ValueError
        If the HTTP status is not 200 or the JSON ``status`` field is not ``"00"``.
    """
    global all_products_df  # The accumulator is rebound below, so it must be declared global.

    # Define the base URL
    url = "https://ap-mc.klikindomaret.com/assets-klikidmsearch/api/get/catalog-xpress/api/webapp/search/result"

    # Request parameters
    params = {
        "keyword": keyword,
        "page": page,
        "size": size,
        "storeCode": store_code,
        "latitude": latitude,
        "longitude": longitude,
        "mode": "DELIVERY",
        "isUserFiltered": "false",
        "districtId": district_id
    }

    # Generate random UUIDs for correlation and device ID
    correlation_id = str(uuid.uuid4())
    device_id = str(uuid.uuid4())

    # Headers to mimic real browser behavior
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "x-correlation-id": correlation_id,
        "apps": f'{{"app_version":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0","device_class":"browser^|browser","device_family":"none","device_id":"{device_id}","os_name":"Windows","os_version":"10"}}',
        "page": "unpage",
        "Origin": "https://www.klikindomaret.com",
        "Connection": "keep-alive",
        "Referer": "https://www.klikindomaret.com/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "TE": "trailers"
    }

    # Send the request
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch data: {response.status_code}\n{response.text}")

    # Parse the JSON response
    data = response.json()
    if data.get("status") != "00":
        raise ValueError(f"API response error: {data.get('message', 'Unknown error')}")

    rows = _extract_product_rows(data)
    if not rows:
        # Nothing to merge; skipping also avoids concatenating an empty frame,
        # which pandas flags with a FutureWarning.
        return

    promo_sku = pd.DataFrame(rows, columns=["productName", "basePrice", "finalPrice", "discountPercent", "plu"])
    promo_sku["location"] = location
    promo_sku = promo_sku.drop_duplicates(subset=["productName", "location"])

    # NOTE(review): the cumulative frame is de-duplicated on productName alone, so a
    # product already stored for one location is dropped when it shows up for another.
    # Kept as-is to preserve the existing output; confirm whether per-location rows are wanted.
    if all_products_df.empty:
        all_products_df = promo_sku.drop_duplicates(subset=["productName"], keep='first')
    else:
        all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')

# Store settings for each location; entries at the same position in the five
# lists are passed together to fetch_products().
latitudes = [-6.4640117, -6.988415]
longitudes = [107.07297, 107.64267]
store_codes = ["TBG2", "TBDG"]
district_ids = [141205778, 141204783]
locations = ["Bogor", "Bandung"]

# Search every keyword at every location, waiting 5 seconds after each request.
for keyword in targets:
    for lat, lon, store, district, place in zip(latitudes, longitudes, store_codes, district_ids, locations):
        fetch_products(
            keyword=keyword,
            latitude=lat,
            longitude=lon,
            store_code=store,
            district_id=district,
            location=place,
            size=100,
        )
        time.sleep(5)

# Show the accumulated results
print(all_products_df)


  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')


                                          productName basePrice finalPrice  \
0   Clear Shampoo Anti Dandruff Fresh Cool Lemon 1...     35500      35500   
1     Clear Shampoo Anti Dandruff Menthol Segar 400mL     70500      52500   
2   Clear Shampoo Anti Dandruff Ice Cool Menthol 3...     63000      63000   
3   Clear Shampoo Anti Dandruff Ice Cool Menthol F...     35500      35500   
4       Clear Shampoo Anti Dandruff Fresh Apple 160mL     35500      23900   
..                                                ...       ...        ...   
8               Posh Men Deo Roll On Active Cool 50Ml     17200      13100   
9                     Posh Deo Roll On Whitening 50Ml     17200      13100   
10  Posh Body Spray Perfumed Hijab Chic Green Blos...     24900      17900   
11          Posh Body Spray Perfumed Blaze Pink 150mL     24900      17900   
12  Posh Body Spray Perfumed Hijab Chic Purple Wis...     24900      17900   

    discountPercent       plu location  
0               0.0  2

In [15]:
# import requests

# url = "https://ap-mc.klikindomaret.com/assets-klikidmgroceries/api/get/catalog-xpress/api/webapp/category/meta"
# params = {
#     "storeCode": "TJKT",
#     "latitude": "-6.1763897",
#     "longitude": "106.82667",
#     "mode": "DELIVERY",
#     "districtId": "141100100"
# }

# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0",
#     "Accept": "application/json, text/plain, */*",
#     "Accept-Language": "en-US,en;q=0.5",
#     "Accept-Encoding": "gzip, deflate, br, zstd",
#     "x-correlation-id": "ddffd180-8f36-4bd2-9f5b-e80ee4eaf702",
#     "apps": '{"app_version":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0","device_class":"browser^|browser","device_family":"none","device_id":"84ffddd6-5338-4726-839c-e55f16d24074","os_name":"Windows","os_version":"10"}',
#     "page": "unpage",
#     "Origin": "https://www.klikindomaret.com",
#     "Connection": "keep-alive",
#     "Referer": "https://www.klikindomaret.com/",
#     "Sec-Fetch-Dest": "empty",
#     "Sec-Fetch-Mode": "cors",
#     "Sec-Fetch-Site": "same-site",
#     "If-None-Match": '"wwq6pe370cnii"',
#     "TE": "trailers"
# }

# response = requests.get(url, headers=headers, params=params)

# # If 200, show JSON result
# if response.status_code == 200:
#     print(response.json())
# else:
#     print(f"Blocked or failed: {response.status_code}")
#     print(response.text)


In [16]:
# import requests
# import pandas as pd
# import time

# # Initialize an empty DataFrame to store the cumulative results
# all_products_df = pd.DataFrame(columns=["productName", "basePrice", "finalPrice", "discountPercent", "plu", "location"])

# def fetch_products(keyword, latitude, longitude, store_code, district_id, location, page=0, size=15):
#     global all_products_df  # Declare the global variable to update it inside the function
    
#     # Define the base URL
#     url = f"https://ap-mc.klikindomaret.com/assets-klikidmsearch/api/get/catalog-xpress/api/webapp/search/result"
    
#     # Request parameters
#     params = {
#         "keyword": keyword,
#         "page": page,
#         "size": size,
#         "storeCode": store_code,
#         "latitude": latitude,
#         "longitude": longitude,
#         "mode": "DELIVERY",
#         "isUserFiltered": "false",
#         "districtId": district_id
#     }

#     # Send the request
#     response = requests.get(url, params=params)
#     if response.status_code != 200:
#         raise ValueError(f"Failed to fetch data: {response.status_code}")
    
#     # Parse the JSON response
#     data = response.json()
#     if data["status"] != "00":
#         raise ValueError(f"API response error: {data['message']}")
    
#     # Extract product information
#     products_data = []
#     if not data['data']['additionalData'].get('notFound'):
#         for product in data["data"]["content"]:
#             title_text = product.get("productName", "Unknown").strip()
#             base_price = product.get("price")
#             final_price = product.get("finalPrice") or base_price
#             discount_text = product.get("discountText") or "0%"  # Use "0%" if discountText is None
#             discount_text = discount_text.replace("%", "").strip()
#             discount_percentage = float(discount_text) if discount_text else 0
#             plu_data = product.get("plu")
            
#             # Append the product details
#             products_data.append([title_text, base_price, final_price, discount_percentage, plu_data])
    
#     # Create a DataFrame for the current results with location column
#     promo_sku = pd.DataFrame(products_data, columns=["productName", "basePrice", "finalPrice", "discountPercent", "plu"])
#     promo_sku["location"] = location  # Add a column for location
#     promo_sku.drop_duplicates(subset=["productName"], inplace=True)
    
#     # Append the current DataFrame to the global DataFrame
#     all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')

# # Example usage
# latitude = -6.4640117
# longitude = 107.07297
# store_code = "TBG2"
# district_id = 141205778
# location = "Bogor"


# for keyword in targets:
#     fetch_products(keyword, latitude, longitude, store_code, district_id, location, size=100)
#     time.sleep(5)

# # Display the cumulative DataFrame
# print(all_products_df)


In [17]:
import pandas as pd
from datetime import datetime


# Terms expected to appear (case-insensitively) somewhere in the scraped product names.
check = ['clear', 'sunsilk', 'lifebuoy', 'tresemme', "pond's",
           'glow', 'vaseline', 'pepsodent',
           'close up', 'lifebuoy', 'lux', 'rexona', 'axe', 'molto', 'sunlight',
           'wipol', 'vixal', 'royco', 'bango', 'sariwangi', 'sarimurni', 'buavita',
           'head', 'pantene', 'zinc', 'garnier', 'nivea',
           'marina', 'ciptadent', 'nuvo', 'giv', 'posh',
           'so klin',  'downy', 'mama lemon', 'supersol',
           'yuri', 'masako', 'garnier sakura glow water gel',
           'sedaap kecap', 'abc kecap manis', 'sosro teh', 'tong tji', 'teh bendera',
           'country choice', 'citra', 'super pel', 'rinso']

# Convert 'productName' column to lowercase for case-insensitive matching
product_names = all_products_df['productName'].str.lower()

# Collect the terms that occur in no product name.
# - dict.fromkeys() removes repeated terms ('lifebuoy' is listed twice) while
#   keeping the original order, so nothing is reported or retried twice.
# - regex=False matches each term as literal text rather than as a pattern.
# - na=False counts a missing product name as "no match".
missing_in_df = [
    item
    for item in dict.fromkeys(check)
    if not product_names.str.contains(item, regex=False, na=False).any()
]

# Display the list of items from 'check' that are not in the DataFrame
print("Items from the check list not found in the product names:")
print(missing_in_df)

Items from the check list not found in the product names:
['sarimurni', 'teh bendera', 'country choice']


In [18]:
# Second pass: search again using each check-list term that matched no product name.
for product in missing_in_df:
    for i in range(len(locations)):
        # Guard each location separately so a failure at one store does not
        # skip the remaining stores for the same term.
        try:
            fetch_products(
                keyword=product,
                latitude=latitudes[i],
                longitude=longitudes[i],
                store_code=store_codes[i],
                district_id=district_ids[i],
                location=locations[i],
                size=100
            )
        except Exception as e:
            print(f"Error scraping {product} ({locations[i]}): {e}")
        finally:
            # Pause after every attempt, including failed ones, before the next request.
            time.sleep(5)

  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')
  all_products_df = pd.concat([all_products_df, promo_sku]).drop_duplicates(subset=["productName"], keep='first')


In [19]:
from datetime import datetime

# Write the accumulated products to a workbook whose name carries today's
# date formatted as two-digit year, month and day.
date_stamp = datetime.now().strftime('%y%m%d')
file_name = f"../indomaret/INDOMARET_{date_stamp}.xlsx"
all_products_df.to_excel(file_name, index=False)

In [20]:
# import requests
# promo_sku = pd.DataFrame(columns=["productName", "basePrice", "finalPrice", "discountPercent"])
# def get_product(product,promo_sku):
#     # Define the URL and headers
#     url = f"https://ap-mc.klikindomaret.com/assets-klikidmsearch/api/get/catalog-xpress/api/webapp/search/result?keyword=clear&isUserFiltered=false&page=0&size=15&storeCode=TBG2&latitude=-6.4640117&longitude=107.07297&mode=DELIVERY&districtId=141205778"
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
#     }

#     # Make the initial request
#     response = requests.get(url, headers=headers)
    
#     print(response)

#     # Now you can make another request with updated cookies
#     response_with_updated_cookies = requests.get(url, headers=headers)

#     # Check the response
#     # print(response_with_updated_cookies.cookies)
#     soup = BeautifulSoup(response_with_updated_cookies.text, "html.parser")
#     print(soup)
        
#     #Check if the search result is empty
#     if not soup.select_one('.produk .rightside .box-item.wrp-noFilter'):
#         print(f"No result for")
#     products = soup.find_all("div", class_="each-item")

#     # List to hold the product data
#     products_data = []

#     # Loop through the products and extract the title and price
#     for product in products:
#         # Extract the title
#         title = product.find("div", class_="title")
#         if title is not None:
#             title_text = title.text.strip()
#             # Extract the price
#             final_price = product.find("span", class_="normal price-value")
#             if final_price:
#                 final_price = convert_price(final_price.text.strip())
            
#             base_price = product.find("span", class_="strikeout disc-price")
#             discount_percentage = product.find("span", class_="discount")
#             # Find the button element
#             button = product.find('button', class_='buyBtn')

#             # Check if the button exists and has the 'data-plu' attribute
#             if button and 'data-plu' in button.attrs:
#                 plu_data = button['data-plu']
#             else:
#                 print(f"No data-plu for {title_text}")
#                 plu_data = None  # If 'data-plu' is not available, set it to None
            
#             if base_price and discount_percentage:
#                 base_price = convert_price(base_price.text.strip().split("\n")[-1])
#                 discount_percentage = float(discount_percentage.text.strip().replace("%", "").strip())
#             else:
#                 base_price = final_price
#                 discount_percentage = 0
#             products_data.append([title_text, base_price, final_price, discount_percentage, plu_data])
#     promo_sku = pd.concat([promo_sku, pd.DataFrame(products_data, columns=["productName", "basePrice", "finalPrice", "discountPercent","plu"])], ignore_index=True)

#     promo_sku.drop_duplicates(subset=["productName"], inplace=True)
#     # print(promo_sku)
#     return promo_sku
# # print(products_data)

In [21]:
import time
# import random
# import pandas as pd
# from bs4 import BeautifulSoup
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# from selenium.webdriver.chrome.options import Options
# from selenium_stealth import stealth

# def scrape_indomaret_products(product,promo_sku):
#     # Set up Chrome options
#     options = Options()
#     # Uncomment for headless mode
#     options.add_argument("--headless")
#     driver = webdriver.Chrome(options=options)

#     # Configure stealth settings to avoid detection
#     stealth(driver,
#            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.105 Safari/537.36',
#            languages=["en-US", "en"],
#            vendor="Google Inc.",
#            platform="Win32",
#            webgl_vendor="Intel Inc.",
#            renderer="Intel Iris OpenGL Engine",
#            fix_hairline=True,
#     )

#     # Initialize list to collect product data
#     products_data = []

#     # Define URL
#     url = f'https://www.klikindomaret.com/search?keyword={product}'
#     driver.get(url)
#     # Wait for product containers to load on the page
#     try:
#         WebDriverWait(driver, 20).until(
#             EC.presence_of_element_located((By.CLASS_NAME, "each-item"))
#         )
#     except Exception as e:
#         print(f"Error waiting for page to load: {e}")
#         driver.quit()
#         return promo_sku


#     # Get page source and parse with BeautifulSoup
#     page_source = driver.page_source
#     soup = BeautifulSoup(page_source, 'html.parser')
#     # print(soup)

#     # Find all product containers
#     product_containers = soup.find_all('div', class_="each-item")

#     # Extract information for each product
#     for product in product_containers:
#         # Extract the title
#         title = product.find("div", class_="title")
#         if title is not None:
#             title_text = title.text.strip()
#             # Extract the price
#             final_price = product.find("span", class_="normal price-value")
#             if final_price:
#                 final_price = convert_price(final_price.text.strip())
            
#             base_price = product.find("span", class_="strikeout disc-price")
#             discount_percentage = product.find("span", class_="discount")
#             # Find the button element
#             button = product.find('button', class_='buyBtn')

#             # Check if the button exists and has the 'data-plu' attribute
#             if button and 'data-plu' in button.attrs:
#                 plu_data = button['data-plu']
#             else:
#                 print(f"No data-plu for {title_text}")
#                 plu_data = None  # If 'data-plu' is not available, set it to None
            
#             if base_price and discount_percentage:
#                 base_price = convert_price(base_price.text.strip().split("\n")[-1])
#                 discount_percentage = float(discount_percentage.text.strip().replace("%", "").strip())
#             else:
#                 base_price = final_price
#                 discount_percentage = 0
#             products_data.append([title_text, base_price, final_price, discount_percentage, plu_data])
#     promo_sku = pd.concat([promo_sku, pd.DataFrame(products_data, columns=["productName", "basePrice", "finalPrice", "discountPercent","plu"])], ignore_index=True)
#     # Drop duplicate products by product name
#     promo_sku.drop_duplicates(subset=["productName"], inplace=True)

#     # Close the driver
#     driver.quit()

#     return promo_sku

# promo_sku = pd.DataFrame(columns=["productName", "basePrice", "finalPrice", "discountPercent"])
# for product in targets:
#     try:
#         promo_sku = scrape_indomaret_products(product,promo_sku)
#         print(promo_sku)
#     except Exception as e:
#         print(f"Error scraping {product}: {e}")
#         continue


In [22]:
import requests
# Probe: request the search endpoint directly, with the query string written into
# the URL and no custom headers (fetch_products sends browser-like headers instead).
# The output recorded below this cell is a Cloudflare "Attention Required!" HTML page.
url = 'https://ap-mc.klikindomaret.com/assets-klikidmsearch/api/get/catalog-xpress/api/webapp/search/result?keyword=clear shampoo&isUserFiltered=false&page=0&size=15&storeCode=TBG2&latitude=-6.4640117&longitude=107.07297&mode=DELIVERY&districtId=141205778'

res = requests.get(url)
# Print the raw response body for inspection.
print(res.text)

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css" />
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->
<style>body{margin:0;padding:0}</style>


<!--[if gte IE 10]><!-->
<script>
  if (!navigator.cookieEnabled) {
    window.addEventListener('DOMContentLoaded', f