# Scraping Amazon Product Page

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver import ChromeOptions
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from time import sleep
import requests
import json
import os

## Load list of product urls

In [None]:
# Read one product url per line. Lines are stripped and blank lines dropped so
# that an empty entry is never sent to the browser and trailing whitespace
# cannot end up in the url tail that is later sliced off as the ASIN.
with open("urls.txt", 'r', encoding="utf-8") as urls_txt:
    urllist = [line.strip() for line in urls_txt if line.strip()]
print(urllist)

## One-time setup of Chrome WebDriver

In [None]:
# Configure and start Chrome. The browser is intentionally left in its default
# (non-headless) mode: due to dynamic elements the scraper cannot run headless.
options = ChromeOptions()
# Pass the options via `options=`; the legacy `chrome_options=` keyword is
# deprecated and is rejected by newer Selenium releases.
driver = webdriver.Chrome(options=options)

## Iterate through the product list and scrape data

In [None]:
# Scrape every product page in `urllist`: collect title, price, bullet points
# and media urls, append one JSON line per product to output.jsonl, and store
# the metadata plus the downloaded media in a folder named after the ASIN.
with open('output.jsonl', 'w') as output:

    for url in urllist:

        # Product url and asin (the asin is taken to be the last 10 characters
        # of the url, so urls are expected to end with it)
        print("Product url: " + url)
        asin = url[-10:]
        print("Product sku: " + asin)

        # Load product page
        driver.get(url)

        # Click all image/video thumbnails to load dynamic elements
        alt_imgs = driver.find_element(By.XPATH, "//div[@id='altImages']")
        lst_item = alt_imgs.find_elements(
            By.XPATH,
            ".//li[contains(@class, 'imageThumbnail') or contains(@class, 'videoThumbnail')]//input",
        )
        for item in lst_item:
            item.click()
            # Brief pause between clicks (presumably so the clicked media can load)
            sleep(0.5)

        # Collect image/video urls for posterior download
        lst_imgs = driver.find_elements(By.XPATH, "//div[@class='imgTagWrapper']/img")
        try:
            video = driver.find_element(By.XPATH, "//div[@id='main-video-container']//video")
            lst_imgs.append(video)
        except NoSuchElementException:
            print("This product has no associated video")
        # Elements without a `src` attribute yield None; drop those so that
        # only real urls are stored and downloaded.
        lst_srcs = [
            src
            for src in (elem.get_attribute('src') for elem in lst_imgs)
            if src
        ]
        print(lst_srcs)
        print("List size: ", len(lst_srcs))

        # Get more product data: title, price, bullets
        title = driver.find_element(By.XPATH, "//span[@id='productTitle']").get_attribute("innerHTML").strip()
        print(title)
        price = driver.find_element(By.XPATH, "//span[@class='a-offscreen']").get_attribute("innerHTML").strip()
        print(price)
        lst_bullets = driver.find_elements(By.XPATH, "//div[@id='feature-bullets']//li/span")
        print(f'Number of span-bullets: {len(lst_bullets)}')

        # Create list of bullet points describing the product.
        # BeautifulSoup removes whatever spurious html element is left in the text
        lst_bullets_text = [
            BeautifulSoup(bul.get_attribute('innerHTML').strip(), 'html.parser').get_text()
            for bul in lst_bullets
        ]
        print(lst_bullets_text)

        # Create object and write to jsonl file
        product = {
            'title': title,
            'asin': asin,
            'price': price,
            'bullets': lst_bullets_text,
            'media': lst_srcs
        }

        # Serialize the object as JSON and write it to the file
        json.dump(product, output)
        # Write a newline character after each object
        output.write('\n')

        # Create folders for each product and write metadata file
        try:
            os.mkdir(asin)
        except FileExistsError:
            print("Folder already exists!")
        prod_file = os.path.join(asin, asin + '.json')
        with open(prod_file, 'w') as pf:
            json.dump(product, pf)

        # Download product media files to folder. A distinct loop variable is
        # used so the product `url` of the outer loop is not overwritten.
        for media_url in lst_srcs:
            try:
                # The timeout keeps a stalled server from hanging the whole run
                r = requests.get(media_url, timeout=30)
            except requests.RequestException:
                # Network failures and unsupported url schemes are reported
                # like any other failed download instead of aborting the scrape
                print("Error downloading media")
                continue
            if r.status_code == 200:
                # File name is the last path segment of the url, without query string
                fname = media_url.split("/")[-1].split("?")[0]
                fpath = os.path.join(asin, fname)
                with open(fpath, 'wb') as fm:
                    fm.write(r.content)
            else:
                print("Error downloading media")

In [None]:
# Quit the WebDriver started in the setup cell now that scraping is finished.
driver.quit()