In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from google.colab import files
import time
from datetime import datetime
import os
import psutil

# --- Scraper configuration -------------------------------------------------
# Paginated category listing URL; "{}" is replaced with the page number.
url_template = "https://pgmall.my/category?path=1&page={}"
# Request headers sent with every page fetch (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# --- Run configuration -----------------------------------------------------
version_label = "Without Optimization"  # label written to the performance log
max_data_limit = 200_000  # stop once at least this many records are saved
performance_log_file = "performance_log.csv"

# --- Baseline resource usage -----------------------------------------------
# Captured once, when this cell/module is executed, for later comparison
# against the end-of-run readings.
process = psutil.Process(os.getpid())
start_cpu = psutil.cpu_percent(interval=1)  # blocks for the 1 s sample window
start_mem = process.memory_info().rss / (1024 ** 2)  # resident set size in MB

def _retry_after_seconds(response, default=5):
    """Return the wait, in seconds, requested by a response's Retry-After header."""
    raw = response.headers.get('Retry-After')
    try:
        return max(0, int(raw))
    except (TypeError, ValueError):
        # Header is missing, or is an HTTP-date rather than delta-seconds.
        return default


def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or None when the tag was not found."""
    return tag.text.strip() if tag is not None else None


def scrape_page(page_number):
    """Scrape one category page and return its product listings.

    The page is requested up to three times. A 429 response waits for the
    interval given by the Retry-After header (5 seconds when it is absent or
    not an integer) before retrying; a network-level error is also retried.
    Any other non-200 status aborts immediately.

    Args:
        page_number: Page index substituted into ``url_template``.

    Returns:
        A list of dicts with the keys ``product_name``, ``link``, ``price``
        and ``location`` (each value is None when the element is missing).
        An empty list is returned when the page could not be fetched or
        contains no listings.
    """
    url = url_template.format(page_number)
    retries = 3

    for attempt in range(retries):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"Request for page {page_number} failed: {exc} (Attempt {attempt+1}/{retries})")
            continue

        if response.status_code == 429:
            retry_after = _retry_after_seconds(response)
            print(f"Rate limited on page {page_number}, retrying in {retry_after} seconds... (Attempt {attempt+1}/{retries})")
            time.sleep(retry_after)
            continue
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        listings = soup.find_all('div', class_='category_product_col_new p-div')

        data = []
        for listing in listings:
            # The first anchor carrying an href is taken as the product link.
            link_tag = listing.find('a', href=True)
            data.append({
                "product_name": _text_or_none(
                    listing.find('p', class_='p-name text-left text-darkgrey')
                ),
                "link": link_tag['href'] if link_tag is not None else None,
                "price": _text_or_none(
                    listing.find('span', class_='p-price-red p-overflow')
                ),
                "location": _text_or_none(
                    listing.find('div', class_='text-left color-grey')
                ),
            })
        return data

    # Every attempt was rate limited or failed; return an empty list so the
    # caller always receives a list rather than an implicit None.
    print(f"Giving up on page {page_number} after {retries} attempts.")
    return []

def save_to_csv(data, csv_file="Item_list.csv"):
    """Append scraped records to a CSV file, writing the header when needed.

    Args:
        data: List of dicts sharing the same keys; the keys of the first
            record define the column order.
        csv_file: Path of the CSV file to append to.

    An empty ``data`` list is a no-op: nothing is written and the file is not
    created. The header row is written when the file does not exist yet or
    exists but is empty.
    """
    if not data:
        return

    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(data[0]))
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

def log_performance(version, total_data_saved, elapsed_time, start_cpu, end_cpu, start_mem, end_mem):
    """Append one run's metrics as a row to the performance log CSV.

    Throughput is records per second, or 0 when ``elapsed_time`` is not
    positive. The column header is written first when the log file is empty
    at the time it is opened.
    """
    if elapsed_time > 0:
        throughput = total_data_saved / elapsed_time
    else:
        throughput = 0

    header = ["version", "total_records", "total_time", "cpu_start", "cpu_end", "mem_start", "mem_end", "throughput"]
    record = [version, total_data_saved, elapsed_time, start_cpu, end_cpu, start_mem, end_mem, throughput]

    with open(performance_log_file, "a", newline="") as log:
        rows = [record]
        # Position 0 right after opening in append mode means nothing has
        # been written to the file yet, so the header goes in first.
        if log.tell() == 0:
            rows.insert(0, header)
        csv.writer(log).writerows(rows)

def main():
    """Scrape category pages in sequence, save them to CSV and log run metrics.

    Pages are fetched starting at page 1 until a page yields no data, an
    error occurs, or at least ``max_data_limit`` records have been saved.
    Afterwards the elapsed time, CPU and memory readings are printed and
    appended to the performance log, and the output files are offered for
    download when running inside Google Colab.
    """
    # Output path shared by the save step and the final download/report step.
    csv_file = "Item_list.csv"
    page_number = 1
    total_data_saved = 0
    request_delay = 0
    start_time = time.time()

    while True:
        print(f"Scraping page {page_number}...")

        try:
            # Fetching happens inside the try so that a failure here still
            # reaches the summary and performance log below.
            property_data = scrape_page(page_number)
            if not property_data:
                print(f"No data found on page {page_number}. Exiting...")
                break

            save_to_csv(property_data, csv_file)
            total_data_saved += len(property_data)
            print(f"Page {page_number} scraped and saved ({len(property_data)} items). Total: {total_data_saved}")
            if total_data_saved >= max_data_limit:
                print(f"Reached the max data limit of {max_data_limit}. Exiting...")
                break

            page_number += 1
            time.sleep(request_delay)
        except Exception as e:
            print(f"Error scraping page {page_number}: {e}")
            break

    end_time = time.time()
    elapsed_time = end_time - start_time

    # End performance stats
    end_cpu = psutil.cpu_percent(interval=1)
    end_mem = process.memory_info().rss / (1024 * 1024)

    # Same zero guard as log_performance, so a zero-length run cannot raise.
    records_per_second = total_data_saved / elapsed_time if elapsed_time > 0 else 0

    # Print results
    print(f"\nTotal records scraped: {total_data_saved}")
    print(f"Total time taken: {elapsed_time:.2f} seconds")
    print(f"Start memory: {start_mem:.2f} MB, End memory: {end_mem:.2f} MB")
    print(f"Start CPU: {start_cpu}%, End CPU: {end_cpu}%")
    print(f"Records per second: {records_per_second:.2f} rec/sec")

    # Log to performance file
    log_performance(version_label, total_data_saved, elapsed_time, start_cpu, end_cpu, start_mem, end_mem)

    # Final cleanup and download (only in Google Colab)
    if os.path.isfile(csv_file):
        try:
            in_colab = 'google.colab' in str(get_ipython())
        except NameError:
            # get_ipython is only defined when running under IPython.
            in_colab = False

        if in_colab:
            files.download(csv_file)
            if os.path.isfile(performance_log_file):
                files.download(performance_log_file)
        else:
            print(f"CSV file saved locally: {csv_file}")
    else:
        print("No data was scraped. CSV file was not created.")

# Entry point: start the scrape only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Page 1506 scraped and saved (50 items). Total: 75228
Scraping page 1507...
Page 1507 scraped and saved (50 items). Total: 75278
Scraping page 1508...
Page 1508 scraped and saved (50 items). Total: 75328
Scraping page 1509...
Page 1509 scraped and saved (50 items). Total: 75378
Scraping page 1510...
Page 1510 scraped and saved (50 items). Total: 75428
Scraping page 1511...
Page 1511 scraped and saved (50 items). Total: 75478
Scraping page 1512...
Page 1512 scraped and saved (50 items). Total: 75528
Scraping page 1513...
Page 1513 scraped and saved (50 items). Total: 75578
Scraping page 1514...
Page 1514 scraped and saved (50 items). Total: 75628
Scraping page 1515...
Page 1515 scraped and saved (50 items). Total: 75678
Scraping page 1516...
Page 1516 scraped and saved (50 items). Total: 75728
Scraping page 1517...
Page 1517 scraped and saved (50 items). Total: 75778
Scraping page 1518...
Page 1518 scraped and saved (49 ite

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>