# Testing Config

In [22]:
import requests
from bs4 import BeautifulSoup

url = "https://www.industrybuying.com/power-tools-641/saws-chasers-17991"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

response = requests.get(url, headers=headers)
print(response.status_code)

200


In [23]:
soup = BeautifulSoup(response.text, "lxml")
print(soup.title.text)


Buy Saws & Chasers Online for Industrial Use


In [24]:
products = soup.find_all("div", class_="IB-prod")
print("Total products:", len(products))


Total products: 0


In [16]:
for product in products:
    name = product.find("span", class_="IB-prod-name")
    price = product.find("span", class_="price")

    if name and price:
        print(name.text.strip(), price.text.strip())


# Capturing Products Brand

In [17]:
brand = product.find("div", class_="brand")
brand = product.find("div", class_="brand")

brand_name = brand.text.strip() if brand else "N/A"


NameError: name 'product' is not defined

In [25]:
import requests
from bs4 import BeautifulSoup
import json

url = "https://www.industrybuying.com/power-tools-641/saws-chasers-17991"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, "lxml")

# find structured product data
scripts = soup.find_all("script", type="application/ld+json")

print(len(scripts))


4


In [26]:
products = []

for script in scripts:
    try:
        data = json.loads(script.string)

        # some pages store list, some store single object
        if isinstance(data, list):
            for item in data:
                if item.get("@type") == "Product":
                    products.append(item)

        elif data.get("@type") == "Product":
            products.append(data)

    except:
        pass

print(len(products))


1


In [27]:
for p in products:
    name = p["name"]
    brand = p["brand"]["name"]
    price = p["offers"]["price"]

    print(name, brand, price)


PowerHouse 355 mm Chop saw Machine PHCM355 with 6 Months Warranty Multi Color PowerHouse 5899


In [29]:
import pandas as pd

rows = []

for p in products:
    rows.append([
        p["name"],
        p["brand"]["name"],
        p["offers"]["price"]
    ])

df = pd.DataFrame(rows, columns=["Product", "Brand", "Price"])

print(df)

# df.to_csv("industrybuying_saws.csv", index=False)
df

                                             Product       Brand  Price
0  PowerHouse 355 mm Chop saw Machine PHCM355 wit...  PowerHouse   5899


Unnamed: 0,Product,Brand,Price
0,PowerHouse 355 mm Chop saw Machine PHCM355 wit...,PowerHouse,5899


# Pagination Test

In [33]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0"
}

all_products = []

def extract_products(data):
    products = []

    def traverse(obj):
        if isinstance(obj, dict):
            if obj.get("type") == "PRODUCT_SUMMARY_LIST":
                items = obj["data"]["renderableComponents"]
                for item in items:
                    value = item["value"]

                    name = value["title"]
                    brand = value["productBrand"].replace("By ", "")
                    price = value["pricing"]["prices"][0]["value"]

                    products.append([name, brand, price])

            for v in obj.values():
                traverse(v)

        elif isinstance(obj, list):
            for i in obj:
                traverse(i)

    traverse(data)
    return products


for page in range(1, 18):

    print("Scraping page:", page)

    url = f"https://www.industrybuying.com/power-tools-641/saws-chasers-17991?page={page}"

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    scripts = soup.find_all("script")

    target_script = None

    for script in scripts:
        if script.string and "PRODUCT_SUMMARY_LIST" in script.string:
            target_script = script.string
            break

    start = target_script.find("{")
    end = target_script.rfind("}") + 1

    data = json.loads(target_script[start:end])

    page_products = extract_products(data)
    all_products.extend(page_products)

    time.sleep(2)

df = pd.DataFrame(all_products, columns=["Product","Brand","Price"])

print("TOTAL PRODUCTS:", len(df))

# df.to_csv("industrybuying_all_products.csv", index=False)
df

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
TOTAL PRODUCTS: 680


Unnamed: 0,Product,Brand,Price
0,"Ingco 1500W 4500 RPM Table Saw , TS15008",Ingco,36579
1,Ingco 85 W 1450 rpm Scroll Saw Machine SS852,Ingco,8613
2,"Ingco WLC15008 Wall Chaser Grinder, with 4 Dis...",Ingco,9439
3,Total TAC52644D 150 X 19 X 1.25 mm Reciprocati...,Total,145
4,VOLTZ VZWC -200 800W Wood Table Saw Machine,Voltz,8849
...,...,...,...
675,"XLNT 1100 W Cordless Chainsaw 7800 RPM Speed, ...",XLNT,6017
676,Stihl TS-420 Petrol Operated Cut off saw with ...,Stihl,93219
677,"Omada Hand Held Wall Chaser Cutter 2800W, OMD-...",Omada,16519
678,Oem 2400W 2400 Watt Professional Cut Off/Chop ...,OEM,9439


# Final Dataframe 

In [35]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import math

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

BASE_URL = "https://www.industrybuying.com/power-tools-641/saws-chasers-17991?page={}"


# ---------- STEP 1 : Get total number of products ----------

def get_total_products():

    url = BASE_URL.format(1)

    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "lxml")

    scripts = soup.find_all("script")

    for script in scripts:
        if script.string and "PRODUCT_SUMMARY_LIST" in script.string:

            text = script.string
            start = text.find("{")
            end = text.rfind("}") + 1

            data = json.loads(text[start:end])

            # recursively search total count
            def find_total(obj):
                if isinstance(obj, dict):
                    for k, v in obj.items():
                        if k == "totalProductCount":
                            return v
                        result = find_total(v)
                        if result:
                            return result
                elif isinstance(obj, list):
                    for i in obj:
                        result = find_total(i)
                        if result:
                            return result
                return None

            total_products = find_total(data)
            return total_products

    return None


# ---------- STEP 2 : Extract product info ----------

def extract_products(data):

    products = []

    def traverse(obj):

        if isinstance(obj, dict):

            if obj.get("type") == "PRODUCT_SUMMARY_LIST":

                items = obj["data"]["renderableComponents"]

                for item in items:
                    value = item["value"]

                    name = value.get("title")

                    brand = value.get("productBrand", "")
                    brand = brand.replace("By ", "")

                    # Prices
                    selling_price = None
                    tax_price = None

                    for p in value["pricing"]["prices"]:
                        if p["priceType"] == "SELLING_PRICE":
                            selling_price = p.get("value")
                        if p["priceType"] == "TAX_EXCLUSIVE_PRICE":
                            tax_price = p.get("value")

                    # SKU
                    sku = value["addToCart"]["action"]["params"]["sku"]

                    # URL
                    product_url = value["renderableComponents"][0]["action"]["params"]["url"]

                    # Image
                    image_url = value["media"]["images"][0]["url"]

                    # Stock
                    in_stock = value.get("inStock")

                    # Lead time
                    lead_time = value.get("skuLeadTime")

                    # Rating
                    rating = value.get("rating", {}).get("average")

                    # Description
                    description = value.get("description")

                    products.append([
                        name,
                        brand,
                        selling_price,
                        tax_price,
                        sku,
                        product_url,
                        image_url,
                        in_stock,
                        lead_time,
                        rating,
                        description
                    ])

            for v in obj.values():
                traverse(v)

        elif isinstance(obj, list):
            for i in obj:
                traverse(i)

    traverse(data)
    return products


# ---------- STEP 3 : SCRAPE ALL PAGES ----------

def scrape_all_pages():

    total_products = get_total_products()

    if not total_products:
        print("Could not detect total products. Using fallback pages = 20")
        total_pages = 20
    else:
        per_page = 40
        total_pages = math.ceil(total_products / per_page)

    print("Total Products:", total_products)
    print("Total Pages:", total_pages)

    all_products = []

    for page in range(1, total_pages + 1):

        print(f"\nScraping Page {page}/{total_pages}")

        url = BASE_URL.format(page)

        try:
            response = requests.get(url, headers=HEADERS, timeout=20)
            soup = BeautifulSoup(response.text, "lxml")

            scripts = soup.find_all("script")

            target_script = None
            for script in scripts:
                if script.string and "PRODUCT_SUMMARY_LIST" in script.string:
                    target_script = script.string
                    break

            if not target_script:
                print("No data found on page", page)
                continue

            start = target_script.find("{")
            end = target_script.rfind("}") + 1

            data = json.loads(target_script[start:end])

            page_products = extract_products(data)

            print("Products found:", len(page_products))

            all_products.extend(page_products)

            time.sleep(2)  # polite delay

        except Exception as e:
            print("Error on page", page, e)

    return all_products


# ---------- MAIN ----------

if __name__ == "__main__":

    all_products = scrape_all_pages()

    df = pd.DataFrame(all_products, columns=[
        "Product",
        "Brand",
        "Selling Price",
        "Tax Exclusive Price",
        "SKU",
        "Product URL",
        "Image URL",
        "In Stock",
        "Lead Time",
        "Rating",
        "Description"
    ])

    print("\nTOTAL ROWS:", len(df))

    # df.to_csv("industrybuying_complete_dataset.csv", index=False)

    print("\nDataset saved as: industrybuying_complete_dataset.csv")
    print(df)

Total Products: 3
Total Pages: 1

Scraping Page 1/1
Products found: 40

TOTAL ROWS: 40

Dataset saved as: industrybuying_complete_dataset.csv
                                              Product               Brand  \
0            Ingco 1500W 4500 RPM Table Saw , TS15008               Ingco   
1        Ingco 85 W 1450 rpm Scroll Saw Machine SS852               Ingco   
2   Ingco WLC15008 Wall Chaser Grinder, with 4 Dis...               Ingco   
3   Total TAC52644D 150 X 19 X 1.25 mm Reciprocati...               Total   
4        VOLTZ  VZWC -200 800W Wood Table Saw Machine               Voltz   
5   Dongcheng 150 mm 1400W Electric Groove Cutter/...           Dongcheng   
6   Voltz Wood Table Saw with 24 & 60 Teeth blades...               Voltz   
7   Dongcheng 2000 W 3800 RPM Electric Cut-Off Mac...           Dongcheng   
8   Dewalt 14 Inch Corded Electric Heavy Duty Chop...              Dewalt   
9             Ingco 5000 RPM 1400W Mitre Saw BMS14007               Ingco   
10  Ingco 2

# Main Category Formation

In [38]:
import requests
import pandas as pd
from datetime import datetime
import time

URL = "https://www.industrybuying.com/api/catalog/getProductListing"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json"
}

all_products = []

def scrape_page(page):

    payload = {
        "pageUri": "/power-tools-641",
        "pageNumber": page
    }

    response = requests.post(URL, json=payload, headers=HEADERS)

    if response.status_code != 200:
        print("Failed page", page)
        return

    data = response.json()

    products = data.get("products", [])

    for p in products:

        record = {
            "entity_name": p.get("title"),
            "category": "Power Tools",
            "location": "India",
            "selling_price": p.get("sellingPrice"),
            "rating_count": p.get("ratingCount"),
            "source_url": "https://www.industrybuying.com" + p.get("productUrl"),
            "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        }

        all_products.append(record)


# paginate
for page in range(1, 3):
    print("Scraping page", page)
    scrape_page(page)
    time.sleep(1)

df = pd.DataFrame(all_products)
print("Total rows:", len(df))

df.to_csv("raw_data.csv", index=False)


Scraping page 1
Failed page 1
Scraping page 2
Failed page 2
Total rows: 0


In [39]:
import requests
from bs4 import BeautifulSoup

BASE = "https://www.industrybuying.com"
url = BASE + "/power-tools-641"

headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")

categories = set()

for a in soup.find_all("a", href=True):
    href = a["href"]

    # actual subcategory pattern
    if href.startswith("/power-tools-641/") and href.count("/") == 2:
        categories.add(BASE + href)

for c in sorted(categories):
    print(c)

print("\nTotal categories:", len(categories))


https://www.industrybuying.com/power-tools-641/all-products
https://www.industrybuying.com/power-tools-641/angle-grinders-889
https://www.industrybuying.com/power-tools-641/armatures-5299
https://www.industrybuying.com/power-tools-641/battery-chargers-17945
https://www.industrybuying.com/power-tools-641/bench-drill-press-17525
https://www.industrybuying.com/power-tools-641/blowers-heat-guns-17940
https://www.industrybuying.com/power-tools-641/building-construction-19707
https://www.industrybuying.com/power-tools-641/carbon-brush-14349
https://www.industrybuying.com/power-tools-641/chain-saw-16503
https://www.industrybuying.com/power-tools-641/chisels-bull-points-6307
https://www.industrybuying.com/power-tools-641/chop-saw-896
https://www.industrybuying.com/power-tools-641/circular-saw-17775
https://www.industrybuying.com/power-tools-641/concrete-vibrators-17975
https://www.industrybuying.com/power-tools-641/cordless-drills-16333
https://www.industrybuying.com/power-tools-641/cordless-i

In [42]:
import requests
from bs4 import BeautifulSoup

BASE = "https://www.industrybuying.com"
url = BASE + "/categories"

headers = {"User-Agent": "Mozilla/5.0"}

soup = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")

main_categories = set()

for a in soup.find_all("a", href=True):
    href = a["href"]

    # pattern: /power-tools-641
    if href.startswith("/") and "-" in href and href.split("-")[-1].isdigit():
        if href.count("/") == 1:
            main_categories.add(BASE + href)

for c in sorted(main_categories):
    print(c)

print("\nTotal main categories:", len(main_categories))



Total main categories: 0


# Final Code Test

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
from datetime import datetime

BASE = "https://www.industrybuying.com"
HEADERS = {"User-Agent": "Mozilla/5.0"}

MAX_CATEGORIES = 10
MAX_PER_CATEGORY = 50

# -----------------------------------
# 1. Get subcategories
# -----------------------------------
def get_subcategories():

    url = BASE + "/power-tools-641"
    soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, "lxml")

    subcats = []

    for a in soup.find_all("a", href=True):
        href = a["href"]

        if href.startswith("/power-tools-641/") and href.count("/") == 2:
            if "all-products" not in href:
                full_url = BASE + href
                if full_url not in subcats:
                    subcats.append(full_url)

    # limit to 10
    return subcats[:MAX_CATEGORIES]


# -----------------------------------
# 2. Extract product JSON
# -----------------------------------
def extract_products(data, page_url, category_name):

    products = []

    def traverse(obj):

        if isinstance(obj, dict):

            if obj.get("type") == "PRODUCT_SUMMARY_LIST":

                for item in obj["data"]["renderableComponents"]:

                    value = item["value"]

                    record = {
                        "category": category_name,
                        "page_url": page_url,
                        "scraped_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                        "raw_product_json": json.dumps(value)   # FULL PRODUCT DATA
                    }

                    products.append(record)

            for v in obj.values():
                traverse(v)

        elif isinstance(obj, list):
            for i in obj:
                traverse(i)

    traverse(data)
    return products

# -----------------------------------
# 3. Scrape one subcategory
# -----------------------------------
def scrape_subcategory(subcat_url):

    print("\nScraping:", subcat_url)

    category_name = subcat_url.split("/")[-1]
    collected = []
    page = 1

    while len(collected) < MAX_PER_CATEGORY:

        url = f"{subcat_url}?page={page}"
        print("  Page:", page)

        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.text, "lxml")

        # find hydration script
        target_script = None
        for script in soup.find_all("script"):
            if script.string and "PRODUCT_SUMMARY_LIST" in script.string:
                target_script = script.string
                break

        if not target_script:
            break

        start = target_script.find("{")
        end = target_script.rfind("}") + 1
        data = json.loads(target_script[start:end])

        page_products = extract_products(data, url, category_name)

        if not page_products:
            break

        for p in page_products:
            if len(collected) < MAX_PER_CATEGORY:
                collected.append(p)
            else:
                break

        page += 1
        time.sleep(2)

    print("  Collected:", len(collected))
    return collected


# -----------------------------------
# 4. Main
# -----------------------------------
def main():

    dataset = []

    subcategories = get_subcategories()
    print("Selected subcategories:", len(subcategories))

    for sub in subcategories:
        products = scrape_subcategory(sub)
        dataset.extend(products)

    df = pd.DataFrame(dataset)

    print("\nTOTAL ROWS:", len(df))
    df.to_csv("../raw_data.csv", index=False)
    print("Saved raw_data.csv")


if __name__ == "__main__":
    main()


Selected subcategories: 10

Scraping: https://www.industrybuying.com/power-tools-641/cordless-power-tools-16332
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/cordless-drills-16333
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/battery-chargers-17945
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/cordless-impact-wrench-17950
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/building-construction-19707
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/heavy-duty-lathe-machines-14541
  Page: 1
  Page: 2
  Collected: 50

Scraping: https://www.industrybuying.com/power-tools-641/industrial-drilling-machine-19708
  Page: 1
  Page: 2
  Collected: 34

Scraping: https://www.industrybuying.com/power-tools-641/drain-cleaner-18354
  Page: 1
  Page: 2
  Collected: 50

Scrap

In [45]:
df