In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from google.colab import files
import time
import concurrent.futures
from datetime import datetime
import os

# URL pattern for one page of a category listing; ``type_id`` selects the
# category and ``page`` the page within it.
url_template = (
    "https://pgmall.my/category"
    "?path={type_id}&page={page}"
)

# Request headers sent with every page fetch (a desktop Chrome User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
}

def scrape_page(page_number, product_type, type_id):
    """Fetch one category listing page and extract its products.

    A 429 response is retried up to three times, sleeping for the number of
    seconds given by the ``Retry-After`` header (5 seconds when the header is
    missing or is not a plain integer).

    Args:
        page_number: Page number substituted into the URL template.
        product_type: Category name; not used by this function, kept so the
            call signature stays unchanged.
        type_id: Category id substituted into the URL template.

    Returns:
        A list of ``{"product_name": ..., "link": ...}`` dicts, one per
        listing element found. An empty list is returned when the response
        status is not 200 or when every attempt was rate limited.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    url = url_template.format(type_id=type_id, page=page_number)
    retries = 3
    default_wait = 5

    for attempt in range(retries):
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 429:
            # Retry-After may also be an HTTP-date, which int() cannot parse;
            # fall back to the default wait in that case.
            try:
                retry_after = int(response.headers.get('Retry-After', default_wait))
            except (TypeError, ValueError):
                retry_after = default_wait
            retry_after = max(0, retry_after)  # time.sleep rejects negatives
            print(f"Rate limited on page {page_number}, retrying in {retry_after} seconds... (Attempt {attempt+1}/{retries})")
            time.sleep(retry_after)
            continue
        elif response.status_code != 200:
            print(f"Failed to fetch page {page_number}. Status code: {response.status_code}")
            return []

        # Parse and extract data
        soup = BeautifulSoup(response.content, 'html.parser')
        listings = soup.find_all('div', class_='category_product_col_new p-div')

        data = []
        for listing in listings:
            # First anchor with an href inside the listing is the product link.
            link_tag = listing.find('a', href=True)
            link = link_tag['href'] if link_tag else None

            # Product name paragraph; None when the element is missing.
            name_tag = listing.find('p', class_='p-name text-left text-darkgrey')
            product_name = name_tag.text.strip() if name_tag else None

            data.append({
                "product_name": product_name,
                "link": link,
            })

        return data

    # Every attempt was rate limited: return an explicit empty list rather
    # than falling off the end of the function with None.
    print(f"Giving up on page {page_number} after {retries} rate-limited attempts.")
    return []

def save_to_csv(data, csv_file):
    """Append rows to a CSV file, writing a header when the file is new or empty.

    Args:
        data: List of dicts sharing the same keys; the keys of the first
            dict define the column order.
        csv_file: Path of the CSV file to append to.

    Returns:
        None. When ``data`` is empty nothing is written and no file is
        created.
    """
    if not data:
        # Nothing to write; also avoids indexing data[0] on an empty list.
        return

    # A header is needed for a brand-new file and for an existing but
    # zero-byte file (e.g. left behind by an interrupted run).
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0

    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

# Function with rate-limiting and data limit
def scrape_product_type(product_type, type_id, max_data_limit=200000, request_delay=0):
    """Scrape every page of one category into ``Item_list_<product_type>.csv``.

    Pages are fetched one after another starting at page 1 until a page
    yields no items, an error occurs, or ``max_data_limit`` items have been
    saved. Rows are appended to the CSV, so an existing file from an earlier
    run is extended rather than replaced.

    Args:
        product_type: Category name; used in the CSV file name.
        type_id: Category id passed through to ``scrape_page``.
        max_data_limit: Stop once at least this many items have been saved.
        request_delay: Seconds to sleep between consecutive page requests.

    Returns:
        None. Progress is printed; in Google Colab the finished CSV is
        offered for download.
    """
    page_number = 1
    total_data_saved = 0

    csv_file = f"Item_list_{product_type}.csv"

    # Pages are fetched strictly sequentially (the next page is only
    # requested after the previous one is saved), so no thread pool is used.
    while True:
        print(f"Scraping page {page_number}...")

        try:
            item_data = scrape_page(page_number, product_type, type_id)
            if item_data:
                save_to_csv(item_data, csv_file)
        except Exception as e:
            # Best-effort boundary: report the failure and stop this
            # category without aborting the caller's loop over categories.
            print(f"Error scraping page {page_number}: {e}")
            break

        if not item_data:
            print(f"No data found on page {page_number}. Exiting...")
            break

        total_data_saved += len(item_data)
        print(f"Page {page_number} scraped and saved ({len(item_data)} items). Total: {total_data_saved}")

        if total_data_saved >= max_data_limit:
            print(f"Reached the maximum data limit of {max_data_limit}. Exiting...")
            break

        page_number += 1
        time.sleep(request_delay)

    # Final cleanup and download (only in Google Colab)
    if os.path.isfile(csv_file):
        # get_ipython only exists inside IPython/Jupyter; treat a plain
        # Python interpreter as "not Colab" instead of raising NameError.
        try:
            in_colab = 'google.colab' in str(get_ipython())
        except NameError:
            in_colab = False

        if in_colab:
            files.download(csv_file)
        else:
            print(f"CSV file saved locally: {csv_file}")
    else:
        print("No data was scraped. CSV file was not created.")

if __name__ == "__main__":
    # (category name, category id) pairs; the name is used for the output
    # file name and the id is placed in the listing URL.
    product_categories = [
        ("Kosmetik kanak-kanak", 1592),
        ("Alat solek", 103),
        ("Bekalan Perubatan", 124),
        ("Penjagaan diri", 46),
        ("Penjagaan Wanita", 50),
        ("Produk Dewasa", 53),
        ("Penjagaan Mata", 57),
        ("Penjagaan mulut", 61),
        ("Penjagaan rambut", 69),
        ("Mandian", 80),
        ("Wangian", 89),
        ("Penjagaan kulit", 93),
        ("Makanan Tambahan", 133),
        ("Alat Kecantikan", 150),
    ]

    # Scrape the categories one at a time, in the order listed above.
    for product_type, type_id in product_categories:
        scrape_product_type(product_type, type_id)

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 350
Scraping page 8...
Page 8 scraped and saved (25 items). Total: 375
Scraping page 9...
No data found on page 9. Exiting...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 350
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 400
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 450
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 500
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 550
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 600
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 650
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 700
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (49 items). Total: 349
Scraping page 8...
Page 8 scraped and saved (49 items). Total: 398
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 448
Scraping page 10...
Page 10 scraped and saved (49 items). Total: 497
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 547
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 597
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 647
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 697
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (48 items). Total: 348
Scraping page 8...
Page 8 scraped and saved (49 items). Total: 397
Scraping page 9...
Page 9 scraped and saved (46 items). Total: 443
Scraping page 10...
Page 10 scraped and saved (47 items). Total: 490
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 540
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 590
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 640
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 690
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 350
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 400
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 450
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 500
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 550
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 600
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 650
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 700
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (49 items). Total: 49
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 99
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 199
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 249
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 299
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 349
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 399
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 449
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 499
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 549
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 599
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 649
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 699
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (49 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 199
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 249
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 299
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 349
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 399
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 449
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 499
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 549
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 599
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 649
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 699
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (48 items). Total: 98
Scraping page 3...
Page 3 scraped and saved (45 items). Total: 143
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 193
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 243
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 293
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 343
Scraping page 8...
Page 8 scraped and saved (49 items). Total: 392
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 442
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 492
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 542
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 592
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 642
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 692
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (49 items). Total: 49
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 99
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 199
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 249
Scraping page 6...
Page 6 scraped and saved (49 items). Total: 298
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 348
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 398
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 448
Scraping page 10...
Page 10 scraped and saved (49 items). Total: 497
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 547
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 597
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 647
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 697
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (47 items). Total: 47
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 97
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 147
Scraping page 4...
Page 4 scraped and saved (45 items). Total: 192
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 242
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 292
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 342
Scraping page 8...
Page 8 scraped and saved (48 items). Total: 390
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 440
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 490
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 540
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 590
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 640
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 690
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (49 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (49 items). Total: 198
Scraping page 5...
Page 5 scraped and saved (48 items). Total: 246
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 296
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 346
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 396
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 446
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 496
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 546
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 596
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 646
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 696
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (49 items). Total: 99
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 199
Scraping page 5...
Page 5 scraped and saved (47 items). Total: 246
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 296
Scraping page 7...
Page 7 scraped and saved (48 items). Total: 344
Scraping page 8...
Page 8 scraped and saved (49 items). Total: 393
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 443
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 493
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 543
Scraping page 12...
Page 12 scraped and saved (49 items). Total: 592
Scraping page 13...
Page 13 scraped and saved (48 items). Total: 640
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 690
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (50 items). Total: 50
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 100
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 150
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 200
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 250
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 300
Scraping page 7...
Page 7 scraped and saved (49 items). Total: 349
Scraping page 8...
Page 8 scraped and saved (50 items). Total: 399
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 449
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 499
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 549
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 599
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 649
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 699
Scraping page 15...
Page 15 scraped and saved (50 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping page 1...
Page 1 scraped and saved (49 items). Total: 49
Scraping page 2...
Page 2 scraped and saved (50 items). Total: 99
Scraping page 3...
Page 3 scraped and saved (50 items). Total: 149
Scraping page 4...
Page 4 scraped and saved (50 items). Total: 199
Scraping page 5...
Page 5 scraped and saved (50 items). Total: 249
Scraping page 6...
Page 6 scraped and saved (50 items). Total: 299
Scraping page 7...
Page 7 scraped and saved (50 items). Total: 349
Scraping page 8...
Page 8 scraped and saved (49 items). Total: 398
Scraping page 9...
Page 9 scraped and saved (50 items). Total: 448
Scraping page 10...
Page 10 scraped and saved (50 items). Total: 498
Scraping page 11...
Page 11 scraped and saved (50 items). Total: 548
Scraping page 12...
Page 12 scraped and saved (50 items). Total: 598
Scraping page 13...
Page 13 scraped and saved (50 items). Total: 648
Scraping page 14...
Page 14 scraped and saved (50 items). Total: 698
Scraping page 15...
Page 15 scraped and saved (50 item

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>