In [3]:
import requests
from bs4 import BeautifulSoup
import json
import os

ModuleNotFoundError: No module named 'requests'

Modularized main and scrape functionality


In [8]:
def fetch_product_details(url, timeout=30):
    """Return the description bullet points of a product detail page.

    Args:
        url: Address of the product detail page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list holding the whitespace-stripped text of every ``<li>`` element
        whose class is ``css-e3648i``, in document order. The list is empty
        when the page has no such element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The bullets are the <li> elements carrying the class 'css-e3648i'.
    bullet_points = soup.find_all('li', class_='css-e3648i')

    return [bullet.get_text(strip=True) for bullet in bullet_points]


def fetch_tech_features(url, timeout=30):
    """Return the technical specification rows of a product page as strings.

    Args:
        url: Address of the product detail page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``"name: value"`` strings. For every ``<dl>`` with class
        ``css-111n9vg`` the main rows come first, followed by the sub rows.
        Rows whose ``<dt>`` or ``<dd>`` cannot be found are skipped instead
        of raising ``AttributeError``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # (row <div> class, <dt> class, <dd> class): main rows first, then sub rows.
    row_layouts = (
        ('css-j6qv4f', 'chakra-text css-cetpsl', 'chakra-text css-1x5aigl'),
        ('css-37f4v5', 'chakra-text css-1apn5qx', 'chakra-text css-rsyz82'),
    )

    output = []
    for box in soup.find_all('dl', class_='css-111n9vg'):
        for row_class, key_class, value_class in row_layouts:
            for feature in box.find_all('div', class_=row_class):
                key = feature.find('dt', class_=key_class)
                value = feature.find('dd', class_=value_class)
                # find() yields None when the element is absent; an incomplete
                # row carries no usable name/value pair, so leave it out.
                if key is None or value is None:
                    continue
                output.append(key.get_text(strip=True) + ': ' + value.get_text(strip=True))

    return output


def scrape_product_info(url, save_path="product_info.json"):
    """Scrape every product on a category listing page and save them as JSON.

    For each ``<div data-id="PLI">`` container on the listing page the name,
    brand, price range and colour count are read, and the linked product page
    is passed to ``fetch_product_details`` and ``fetch_tech_features`` (each
    of which downloads that page itself).

    Args:
        url: Address of the category listing page.
        save_path: File the resulting list of product dicts is written to
            (overwritten if it exists).

    Returns:
        None. The result is written to ``save_path``.

    Raises:
        requests.HTTPError: If the listing page or a product page answers
            with an error status.

    Listing fields that are missing from a container are stored as ``'N/A'``.
    A container without a product link gets empty ``description`` and
    ``tech_features`` lists, because there is no page to fetch them from.
    """
    def text_or_na(tag):
        # find() returns None when the element is absent from the container.
        return tag.get_text(strip=True) if tag is not None else 'N/A'

    # A timeout keeps a stalled connection from blocking the scrape forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    output = []
    for container in soup.find_all('div', {'data-id': 'PLI'}):
        brand_name = container.find('p', {'data-id': 'brandName'})
        title_tag = container.find('h2', {'data-id': 'title'})
        price_range = container.find('span', {'data-id': 'price'})
        color_counts = container.find('p', {'data-id': 'colorsCount'})
        product_link = container.find('a', class_='chakra-linkbox__overlay css-1uw88nq')

        if product_link is not None and product_link.get('href'):
            full_product_link = 'https://www.backcountry.com' + product_link['href']
            description = fetch_product_details(full_product_link)
            tech_features = fetch_tech_features(full_product_link)
        else:
            # No usable link: requesting a placeholder such as 'N/A' would
            # only make requests fail on an invalid URL.
            description = []
            tech_features = []

        output.append({
            'Item_Name': text_or_na(title_tag),
            'Brand_Name': text_or_na(brand_name),
            'price_range': text_or_na(price_range),
            'color_counts': text_or_na(color_counts),
            'description': description,
            'tech_features': tech_features
        })

    with open(save_path, "w") as file:
        json.dump(output, file, indent=2)



def scrape_all_categories(categories=None, base_url='https://www.backcountry.com/cat/'):
    """Scrape each category listing into ``<category>.json`` and report counts.

    Args:
        categories: Iterable of category slugs appended to ``base_url``.
            Defaults to the six men's/women's clothing, footwear and
            accessories categories.
        base_url: Prefix the category slug is appended to.

    For every category ``scrape_product_info`` is called with the file name
    ``<category>.json`` (relative to the current working directory). The file
    is then read back and the number of stored items is printed, or a
    "not found" line if the file does not exist.
    """
    if categories is None:
        categories = [
            'mens-clothing',
            'mens-footwear',
            'mens-accessories',
            'womens-clothing',
            'womens-footwear',
            'womens-accessories'
        ]

    for category in categories:
        # Build link name and filename to write
        full_url = base_url + category
        file_name = f"{category}.json"

        # Scrape and save to file
        scrape_product_info(full_url, save_path=file_name)

        # Count number of products written in the JSON file
        if os.path.exists(file_name):
            with open(file_name, "r") as f:
                data = json.load(f)
            print(f"{file_name} has {len(data)} items.")
        else:
            print(f"{file_name} not found.")



# Start the scrape only when this code runs as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    scrape_all_categories()



mens-clothing.json has 42 items.
mens-footwear.json has 42 items.
mens-accessories.json has 42 items.
womens-clothing.json has 42 items.
womens-footwear.json has 42 items.
womens-accessories.json has 42 items.


# JSON Output

In [3]:
def fetch_product_details(url, timeout=30):
    """Download a product detail page and return its description bullets.

    Args:
        url: Address of the product detail page.
        timeout: Seconds to wait for the server; prevents the call from
            hanging indefinitely on an unresponsive connection.

    Returns:
        The stripped text of each ``<li class="css-e3648i">`` element as a
        list, in document order (empty if none are present).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Select the <li> elements with class 'css-e3648i' and keep only their text.
    return [
        bullet.get_text(strip=True)
        for bullet in soup.find_all('li', class_='css-e3648i')
    ]


def fetch_tech_features(url, timeout=30):
    """Download a product detail page and return its technical features.

    Args:
        url: Address of the product detail page.
        timeout: Seconds to wait for the server; prevents the call from
            hanging indefinitely on an unresponsive connection.

    Returns:
        A list of ``"name: value"`` strings read from every
        ``<dl class="css-111n9vg">``: first its main rows, then its sub rows.
        A row lacking its ``<dt>`` or ``<dd>`` is skipped rather than causing
        an ``AttributeError``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    def collect(box, row_class, key_class, value_class):
        # Turn each complete row of one kind into a "name: value" string.
        pairs = []
        for feature in box.find_all('div', class_=row_class):
            key = feature.find('dt', class_=key_class)
            value = feature.find('dd', class_=value_class)
            if key is not None and value is not None:
                pairs.append(key.get_text(strip=True) + ': ' + value.get_text(strip=True))
        return pairs

    output = []
    for box in soup.find_all('dl', class_='css-111n9vg'):
        # Main features
        output.extend(collect(box, 'css-j6qv4f', 'chakra-text css-cetpsl', 'chakra-text css-1x5aigl'))
        # Sub features
        output.extend(collect(box, 'css-37f4v5', 'chakra-text css-1apn5qx', 'chakra-text css-rsyz82'))

    return output

# Main URL
url = 'https://www.backcountry.com/cat/mens-clothing'

# A timeout keeps a stalled connection from blocking the notebook forever.
response = requests.get(url, timeout=30)
response.raise_for_status()


# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Find all product containers
product_containers = soup.find_all('div', {'data-id': 'PLI'})


def _text_or_na(tag):
    """Return the stripped text of ``tag``, or 'N/A' when the lookup found nothing."""
    return tag.get_text(strip=True) if tag is not None else 'N/A'


output = []
# Extract the listing fields of the first five products only
for container in product_containers[:5]:
    brand_name = container.find('p', {'data-id': 'brandName'})
    title_tag = container.find('h2', {'data-id': 'title'})
    price_range = container.find('span', {'data-id': 'price'})
    color_counts = container.find('p', {'data-id': 'colorsCount'})
    product_link = container.find('a', class_='chakra-linkbox__overlay css-1uw88nq')

    if product_link is not None and product_link.get('href'):
        full_product_link = 'https://www.backcountry.com' + product_link['href']
        description = fetch_product_details(full_product_link)
        tech_features = fetch_tech_features(full_product_link)
    else:
        # Without a link there is no detail page to fetch; passing the 'N/A'
        # placeholder to requests would only fail on an invalid URL.
        full_product_link = 'N/A'
        description = []
        tech_features = []

    output.append(
        {
            'Item_Name': _text_or_na(title_tag),
            'Brand_Name': _text_or_na(brand_name),
            'price_range': _text_or_na(price_range),
            'color_counts': _text_or_na(color_counts),
            'description': description,
            'tech_features': tech_features,
        }
    )


# Write the collected products to disk as indented JSON
with open("product_info.json", "w") as file:
    json.dump(output, file, indent=2)

# Free Text Output

In [None]:
def fetch_product_details(url, timeout=30):
    '''
    Fetch the description bullet points from a single product detail page.

    Args:
        url: Address of the product detail page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the call forever.

    Returns:
        A list with the stripped text of every <li class="css-e3648i">
        element, for example:

            [
                "Light, versatile fleece for crisp shoulder season outings",
                "Recycled double-sided fabric gives warming, anti-pilling performance",
                "Snap placket and chest pocket deliver stylish yet functional flair",
                "Elastic binding at cuffs and hem keep cold drafts out"
            ]

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within `timeout` seconds.
    '''
    # Send a request to fetch the page content
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <li> tags with the class 'css-e3648i'
    bullet_points = soup.find_all('li', class_='css-e3648i')

    # Extract and clean the text content of each <li>
    description = [bullet.get_text(strip=True) for bullet in bullet_points]

    return description


def fetch_tech_features(url, timeout=30):
    '''
    Fetch the technical features from a single product detail page.

    Args:
        url: Address of the product detail page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the call forever.

    Returns:
        A list of "name: value" strings, for example:

            ["Fleece Weight: lightweight", "Fit: regular", "Length: hip"]

        For each <dl class="css-111n9vg"> the main rows are listed before the
        sub rows. Rows without a matching <dt> or <dd> are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within `timeout` seconds.
    '''
    # Send a request to fetch the page content
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <dl> tags with the class 'css-111n9vg'
    tech_specs = soup.find_all('dl', class_='css-111n9vg')

    output = []
    for box in tech_specs:
        main_feature = box.find_all('div', class_='css-j6qv4f')
        sub_features = box.find_all('div', class_='css-37f4v5')

        for feature in main_feature:
            main_key = feature.find('dt', class_='chakra-text css-cetpsl')
            main_value = feature.find('dd', class_='chakra-text css-1x5aigl')

            # find() returns None for a missing element; skip incomplete rows
            if main_key is None or main_value is None:
                continue
            output.append(main_key.get_text(strip=True) + ': ' + main_value.get_text(strip=True))

        for feature in sub_features:
            sub_key = feature.find('dt', class_='chakra-text css-1apn5qx')
            sub_value = feature.find('dd', class_='chakra-text css-rsyz82')

            if sub_key is None or sub_value is None:
                continue
            output.append(sub_key.get_text(strip=True) + ': ' + sub_value.get_text(strip=True))

    return output

# Main URL
url = 'https://www.backcountry.com/cat/mens-clothing'

# A timeout keeps a stalled connection from blocking the notebook forever.
response = requests.get(url, timeout=30)
response.raise_for_status()


# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Find all product containers
product_containers = soup.find_all('div', {'data-id': 'PLI'})


def _text_or_na(tag):
    """Return the stripped text of ``tag``, or 'N/A' when the lookup found nothing."""
    return tag.get_text(strip=True) if tag is not None else 'N/A'


output = ''
# Build one free-text record per product (at most the first 100 containers)
for container in product_containers[:100]:
    brand_name = container.find('p', {'data-id': 'brandName'})
    title_tag = container.find('h2', {'data-id': 'title'})
    price_range = container.find('span', {'data-id': 'price'})
    color_counts = container.find('p', {'data-id': 'colorsCount'})
    product_link = container.find('a', class_='chakra-linkbox__overlay css-1uw88nq')

    if product_link is not None and product_link.get('href'):
        full_product_link = 'https://www.backcountry.com' + product_link['href']
        description = fetch_product_details(full_product_link)
        tech_features = fetch_tech_features(full_product_link)
    else:
        # Without a link there is no detail page to fetch; passing the 'N/A'
        # placeholder to requests would only fail on an invalid URL.
        full_product_link = 'N/A'
        description = []
        tech_features = []

    description_str = ','.join(description)  # Turn List to String
    tech_features_str = ','.join(tech_features)  # Turn List to String

    output += f"""
#Item_Name#: {_text_or_na(title_tag)},
#Brand_Name#: {_text_or_na(brand_name)},
#price_range#: {_text_or_na(price_range)},
#color_counts#: {_text_or_na(color_counts)},
#description#: {description_str} ,
#tech_features#: {tech_features_str},
#product_link#: {full_product_link},

    """


# Show the assembled free-text dump
print(output)


#Item_Name#: Lightweight Synchilla Snap-T Fleece Pullover - Men's,
#Brand_Name#: Patagonia,
#price_range#: $69.50 -$139.00,
#color_counts#: 7 colors,
#description#: Light, versatile fleece for crisp shoulder season outings,Recycled double-sided fabric gives warming, anti-pilling performance,Snap placket and chest pocket deliver stylish yet functional flair,Elastic binding at cuffs and hem keep cold drafts out ,
#tech_features#: Responsible Collection: Recycled/Repurposed,Material: [face fabric] 100% recycled polyester, [placket, pocket] 100% recycled nylon, DWR finish, [pocket bag] 100% recycled polyester brushed tricot,Fleece Weight: lightweight,Fit: regular,Length: hip,Center Back Length: [meidum] 28.5in,Hood: none,Pockets: 1 snap-flap chest,Claimed Weight: 13.2oz,Activity: casual, hiking,Manufacturer Warranty: lifetime,
#product_link#: https://www.backcountry.com/patagonia-lightweight-synchilla-snap-t-fleece-pullover-mens,
              
    
#Item_Name#: Concurve 5in 2in1 Short - 

In [None]:
# Save the free-text dump held in `output` to disk. The encoding is pinned to
# UTF-8 so that non-ASCII characters in the scraped text are written the same
# way on every platform instead of depending on the locale's default codec.
with open("product_info.txt", "w", encoding="utf-8") as file:
    file.write(output)

In [None]:
# URL of the single product detail page that fetch_tech_features is tried on
# in this cell
url = 'https://www.backcountry.com/patagonia-lightweight-synchilla-snap-t-fleece-pullover-mens'

def fetch_tech_features(url, timeout=30):
    """Return the technical features of a product page as one-entry dicts.

    Args:
        url: Address of the product detail page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``{name: value}`` dicts, one per feature row. For every
        ``<dl>`` with class ``css-111n9vg`` the main rows come first, followed
        by the sub rows. Rows whose ``<dt>`` or ``<dd>`` cannot be found are
        skipped instead of raising ``AttributeError``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # (row <div> class, <dt> class, <dd> class): main rows first, then sub rows.
    row_layouts = (
        ('css-j6qv4f', 'chakra-text css-cetpsl', 'chakra-text css-1x5aigl'),
        ('css-37f4v5', 'chakra-text css-1apn5qx', 'chakra-text css-rsyz82'),
    )

    output = []
    for box in soup.find_all('dl', class_='css-111n9vg'):
        for row_class, key_class, value_class in row_layouts:
            for feature in box.find_all('div', class_=row_class):
                key = feature.find('dt', class_=key_class)
                value = feature.find('dd', class_=value_class)
                # find() yields None when the element is absent; an incomplete
                # row carries no usable name/value pair, so leave it out.
                if key is None or value is None:
                    continue
                output.append({
                    key.get_text(strip=True): value.get_text(strip=True)
                })

    return output

# Fetch the technical features of the page at `url`
# (note: the result is stored under the name `description`)
description = fetch_tech_features(url)




# Display the returned list of feature dicts as the cell output
description


[{'Responsible Collection': 'Recycled/Repurposed'},
 {'Material': '[face fabric] 100% recycled polyester, [placket, pocket] 100% recycled nylon, DWR finish, [pocket bag] 100% recycled polyester brushed tricot'},
 {'Fleece Weight': 'lightweight'},
 {'Fit': 'regular'},
 {'Length': 'hip'},
 {'Center Back Length': '[meidum] 28.5in'},
 {'Hood': 'none'},
 {'Pockets': '1 snap-flap chest'},
 {'Claimed Weight': '13.2oz'},
 {'Activity': 'casual, hiking'},
 {'Manufacturer Warranty': 'lifetime'}]