In [2]:
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import csv, pandas as pd 
import logging
import time

In [3]:

all_links = []
base_url = 'https://meghdadit.com/productlist/laptop/?im=true&page=' 

def scrape_page(page_number):
    try:
    
        global all_links  # Declare global if you intend to modify it
        url = base_url + str(page_number)
        print(f"Scraping page {page_number}")  # Print the current page being scraped
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', {'class': 'd-block'})]
        all_links += links
        print(f"Completed scraping for page {page_number}")
    except Exception as e:
        print(f"An error occurred on page {page_number}: {e}")

pages = 20  # Total number of pages to scrape
with ThreadPoolExecutor(max_workers=50) as executor:
    executor.map(scrape_page, range(1, pages + 1))
    
with open('links.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for link in all_links:
        writer.writerow([link])

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Completed scraping for page 19
Completed scraping for page 11
Completed scraping for page 20
Completed scraping for page 8
Completed scraping for page 5
Completed scraping for page 14
Completed scraping for page 1
Completed scraping for page 12
Completed scraping for page 4
Completed scraping for page 17
Completed scraping for page 6
Completed scraping for page 9
Completed scraping for page 3
Completed scraping for page 7
Completed scraping for page 16
Completed scraping for page 15
Completed scraping for page 13
Completed scraping for page 2
Completed scraping for page 10
Completed scraping for page 18


In [None]:
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import csv
import logging

# Initialize logging
logging.basicConfig(filename='scraping_log.log', level=logging.INFO)

# Create a set to store all possible attributes
all_possible_attributes = set()

def scrape_laptop_details(link, index):
    try:
        print(f"Scraping laptop #{index} details from {link}")
        logging.info(f"Scraping laptop #{index} details from {link}")
        
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        parent = soup.find('div', {'class': 'product-tab-body main-line-height'})
        
        if parent is None:
            raise Exception("Parent element not found")
        
        attributes = parent.find_all('div', {'class': 'attribute-caption'})
        
        if len(attributes) == 0:
            raise Exception("No laptop details found")
        
        laptop_details = {}
        for attribute in attributes:
            attribute_name = attribute.text.strip()
            all_possible_attributes.add(attribute_name)  # Add attribute to the global set
            value_element = attribute.find_next_sibling('div', {'class': 'attribute-value'})
            value = value_element.text.strip() if value_element else 'Unknown'
            laptop_details[attribute_name] = value

        # Fill in 'Unknown' for missing attributes
        for missing_attr in all_possible_attributes - set(laptop_details.keys()):
            laptop_details[missing_attr] = 'Unknown'
        
        # Write to CSV
        with open('laptop_details.csv', 'a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=list(all_possible_attributes))
            if file.tell() == 0:
                writer.writeheader()
            writer.writerow(laptop_details)

        print(f"Completed scraping laptop #{index} details from {link}")
        logging.info(f"Completed scraping laptop #{index} details from {link}")

    except Exception as e:
        print(f"An error occurred while scraping laptop #{index} details from {link}: {e}")
        logging.error(f"An error occurred while scraping laptop #{index} details from {link}: {e}")

if __name__ == '__main__':
    with open('links.csv', 'r') as file:
        links = ['https://meghdadit.com'+row[0] for row in csv.reader(file)]
    
    with open('laptop_details.csv', 'w', newline='', encoding='utf-8') as file:
        pass
    
    with ThreadPoolExecutor(max_workers=20) as executor:
        executor.map(scrape_laptop_details, links, range(1, len(links) + 1))


Scraping laptop #1 details from https://meghdadit.com/myprofile/
Scraping laptop #2 details from https://meghdadit.com/product/134503/hp-fd0237nia-c-core-i7-1355u-8gb-1tb-ssd-2gb-full-hd-laptop/
Scraping laptop #3 details from https://meghdadit.com/product/123835/hp-omen-16-c0002dx-a-ryzen-7-5800h-16gb-1tb-ssd-8gb-rx6600m-full-hd-ips-laptop/
Scraping laptop #4 details from https://meghdadit.com/product/123845/hp-omen-16-c0002dx-b-ryzen-7-5800h-32gb-1tb-ssd-8gb-rx6600m-full-hd-ips-laptop/
Scraping laptop #5 details from https://meghdadit.com/product/123560/microsoft-surface-laptop-studio-14-4inch-core-i7-11370h-32gb-2tb-ssd-4gb-rtx-a20/
Scraping laptop #6 details from https://meghdadit.com/product/123563/microsoft-surface-laptop-studio-14-4inch-core-i7-11370h-16gb-512gb-ssd-4gb-rtx-3/
Scraping laptop #7 details from https://meghdadit.com/product/123570/microsoft-surface-laptop-studio-14-4inch-core-i7-11370h-32gb-2tb-ssd-4gb-rtx-305/
Scraping laptop #8 details from https://meghdadit.com/

In [4]:
# all_laptop_details = []

# def scrape_single_laptop(link):
#     try:
#         print(f"Scraping laptop details from {link}")
        
#         response = requests.get(link)
#         if response.status_code != 200:
#             raise Exception(f"Failed to fetch the page: {response.status_code}")
        
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         parent = soup.find('div', {'class': 'product-tab-body main-line-height'})
#         if parent is None:
#             raise Exception("Parent element not found")
        
#         attributes = parent.find_all('div', {'class': 'attribute-caption'})
#         if len(attributes) == 0:
#             raise Exception("No laptop details found")
        
#         laptop_details = {}
#         for attribute in attributes:
#             attribute_name = attribute.text.strip()
#             value_element = attribute.find_next_sibling('div', {'class': 'attribute-value'})
#             value = value_element.text.strip() if value_element else 'Unknown'
#             laptop_details[attribute_name] = value
            
#         print("Scraped laptop details:", laptop_details)
#         all_laptop_details.append(laptop_details)
#     except Exception as e:
#         print(f"An error occurred: {e}")


# # Let's test the function with a single link
# test_link = ' https://meghdadit.com/product/123560/microsoft-surface-laptop-studio-14-4inch-core-i7-11370h-32gb-2tb-ssd-4gb-rtx-a20/'  # Replace with one of your actual links
# scrape_single_laptop(test_link)
# # Write all laptop details to a CSV file
# keys = all_laptop_details[0].keys()  # Assuming the first dictionary has all the keys
# with open('laptop_details.csv', 'w', newline='', encoding='utf-8') as file:
#     writer = csv.DictWriter(file, fieldnames=keys)
#     writer.writeheader()
#     writer.writerows(all_laptop_details)

Scraping laptop details from  https://meghdadit.com/product/123560/microsoft-surface-laptop-studio-14-4inch-core-i7-11370h-32gb-2tb-ssd-4gb-rtx-a20/
Scraped laptop details: {'ابعاد': '323.28 * 228.32 * 18.94 میلی\u200cمتر', 'وزن': '1.8 کیلوگرم', 'نوع دستگاه': 'نوت بوک (لپ تاپ)', 'اندازه صفحه نمایش': '14.4 اینچ', 'نوع صفحه نمایش': 'PixelSense™ Display', 'رزولوشن صفحه نمایش': '1600 * 2560', 'صفحه نمایش لمسی': '', 'توضیحات صفحه نمایش': 'نسبت تصویر 3 به 2Dolby Vision® support', 'سازنده پردازنده': 'INTEL', 'سری پردازنده': 'Core i7', 'مدل پردازنده': 'i7-11370H', 'فرکانس پردازنده': '3.3 گیگاهرتز', 'فرکانس پردازنده در حالت توربو': '4.8 گیگاهرتز', 'حافظه  کش پردازنده': '12  مگابایت', 'حافظه رم': '32 گیگابایت', 'نوع حافظه رم': 'DDR4', 'ظرفیت حافظه داخلی': '2 ترابایت', 'نوع حافظه داخلی': 'SSD', 'سازنده کارت گرافیک': 'Nvidia Gforce', 'مدل کارت گرافیک': 'RTX™ A2000', 'حافظه اختصاصی گرافیکی': '4 گیگابایت', 'درگاه USB-C': '2', 'درگاه VGA': '', 'درگاه HDMI': '', 'درگاه Thunderbolt': '', 'اتصال Wi-Fi':

In [5]:
def extract_laptop_info_from_url(url):
    # Split the URL into parts
    parts = url.split('/')

    # The manufacturer and model are in the 5th part of the URL
    # Split this part into words
    words = parts[5].split('-')

    # The manufacturer is the first word and the model is the second word
    manufacturer = words[0]
    model = words[1]

    return manufacturer, model

In [7]:
# Example usage
url = "https://meghdadit.com/product/134506/lenovo-ideapad-slim-3-c-core-i3-1305u-8gb-256gb-ssd-intel-fhd-laptop/"
manufacturer, model = extract_laptop_info_from_url(url)
print(f"Manufacturer: {manufacturer}, Model: {model}")

Manufacturer: lenovo, Model: ideapad
