In [None]:
# https://www.amazon.com/gp/navigation-country/select-country
# Install the runtime dependencies used by this notebook:
#   - selenium: Python bindings used to drive the Chrome browser
#   - chromium-chromedriver: system package providing the browser driver
#   - requests-html: provides the HTML parser used on the rendered pages
# NOTE(review): none of these installs pin a version, so a fresh run picks up
# whatever release is current; consider pinning for reproducibility.
!pip install selenium
!apt install chromium-chromedriver
!pip install requests-html

In [None]:
import datetime
import re
import time
from pathlib import Path

import pandas as pd
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [None]:
# CSV file that receives the product links collected from the category pages.
product_category_links_output = "/content/category-products.csv"
# CSV file that receives the per-product rows (later including title and price).
product_output = "/content/products.csv"

In [None]:
# Chrome flags for running inside a container / hosted runtime:
#   --headless               run without a visible browser window
#   --no-sandbox             disable the Chrome sandbox
#   --disable-dev-shm-usage  do not rely on /dev/shm for shared memory
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Browser session shared by every scraping function in this notebook.
# The driver executable is resolved from PATH ("chromedriver" is the default
# in older Selenium releases); passing the path positionally is a legacy call
# form that newer Selenium releases reject, so only `options` is supplied.
driver = webdriver.Chrome(options=options)

In [None]:
# Best-seller category pages to crawl. Each entry carries a short `name`
# label and the `url` of the category page that is loaded in the browser.
categories = [
    {"name": "toys-and-games", "url": "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/"},
    {"name": "electronics", "url": "https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/"},
    {"name": "video-games", "url": "https://www.amazon.com/best-sellers-video-games/zgbs/videogames/"},
    {"name": "books", "url": "https://www.amazon.com/best-sellers-books/zgbs/books/"}
]

In [None]:
categories

[{'name': 'toys-and-games',
  'url': 'https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/'},
 {'name': 'electronics',
  'url': 'https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/'},
 {'name': 'video-games',
  'url': 'https://www.amazon.com/best-sellers-video-games/zgbs/videogames/'},
 {'name': 'books',
  'url': 'https://www.amazon.com/best-sellers-books/zgbs/books/'}]

In [None]:
# URL shapes recognised as product pages. Every pattern exposes the product
# identifier through the named group ``product_id``. The dots of the host name
# are escaped so they match a literal "." only (an unescaped "." would accept
# any character, e.g. "wwwXamazon.com").
regex_options = [
    r"https://www\.amazon\.com/gp/product/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/dp/(?P<product_id>[\w-]+)/",
    r"https://www\.amazon\.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
]

# Compiled once here rather than on every call of the function below.
_compiled_regex_options = [re.compile(regex_str) for regex_str in regex_options]


def extract_product_id_from_url(url):
    """Return the product id embedded in an Amazon product URL.

    The URL is matched from its start against each pattern in
    ``regex_options``.

    Args:
        url: Absolute URL string. Values that are not strings are treated as
            "no product" instead of raising.

    Returns:
        The ``product_id`` group of the matching pattern, or ``None`` when no
        pattern matches.
    """
    if not isinstance(url, str):
        return None
    product_id = None
    for regex in _compiled_regex_options:
        match = regex.match(url)
        if match is not None:
            # No early exit on purpose: when several patterns match, the later
            # one wins. For ".../dp/dp/<id>/" the bare "/dp/" pattern would
            # report "dp", while the slug pattern reports the real id.
            product_id = match.group("product_id")
    return product_id

In [None]:
def clean_page_links(page_links=[], category=None):
    """Strip tracking suffixes from links and keep only product pages.

    Each link is cut just before its first ``ref=`` / ``ref_=`` component
    (one that directly follows "/", "?" or "&"). Cutting at the first bare
    "ref" substring instead would truncate URLs whose slug merely contains
    those letters (e.g. "Preferred-...") and silently drop the product.

    Args:
        page_links: Iterable of absolute URL strings. The default list is
            never mutated.
        category: Value stored unchanged under the "category" key of every
            returned entry.

    Returns:
        A list of ``{"url", "product_id", "category"}`` dicts, one per link
        for which ``extract_product_id_from_url`` found a product id.
    """
    final_page_links = []
    for raw_url in page_links:
        url = re.split(r"(?<=[/?&])ref(?=[=_])", raw_url, maxsplit=1)[0]
        product_id = extract_product_id_from_url(url)
        if product_id is not None:
            final_page_links.append({"url": url, "product_id": product_id, "category": category})
    return final_page_links

In [None]:
def scrape_category_product_links(categories=[]):
    """Collect product links from every category page.

    For each category the page at ``category["url"]`` is loaded in the shared
    ``driver``, the rendered ``<body>`` markup is parsed, and every
    site-relative link found in it is turned into an absolute
    ``https://www.amazon.com`` URL and passed through ``clean_page_links``.

    Args:
        categories: Iterable of dicts with a "url" key. The default list is
            never mutated.

    Returns:
        The concatenated ``clean_page_links`` results of all categories.
    """
    all_product_links = []
    for category in categories:
        # Fixed pause before each page load (presumably to throttle requests).
        time.sleep(1.5)
        driver.get(category.get("url"))
        # `find_element(By, value)` is available in old and new Selenium
        # releases; the `find_element_by_*` helpers are gone from newer ones.
        body_el = driver.find_element(By.CSS_SELECTOR, "body")
        html_obj = HTML(html=body_el.get_attribute("innerHTML"))
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        all_product_links.extend(clean_page_links(page_links=page_links, category=category))
    return all_product_links

In [None]:
def extract_categories_and_save(categories=[]):
    """Scrape the product links of `categories` and write them to CSV.

    The links returned by ``scrape_category_product_links`` are loaded into a
    DataFrame and saved, without the index column, to the path held in
    ``product_category_links_output``.
    """
    scraped_links = scrape_category_product_links(categories)
    pd.DataFrame(scraped_links).to_csv(product_category_links_output, index=False)

In [None]:
# Crawl every configured category page in the browser and write the collected
# product links to `product_category_links_output`.
extract_categories_and_save(categories=categories)

In [None]:
def scrape_product_page(url, title_lookup = "#productTitle", price_lookup = "#priceblock_ourprice"):
    """Load a product page and read its title and price text.

    Args:
        url: Product page URL opened in the shared ``driver``.
        title_lookup: CSS selector of the element holding the title.
        price_lookup: CSS selector of the element holding the price.

    Returns:
        A ``(product_title, product_price)`` tuple. Each item is the text of
        the first element matching its selector, or ``None`` when the page has
        no such element, so a missing price no longer discards a title that
        was found.
    """
    driver.get(url)
    # Fixed pause after navigation (presumably to let the page finish rendering).
    time.sleep(1.5)
    # `find_element(By, value)` is available in old and new Selenium releases;
    # the `find_element_by_*` helpers are gone from newer ones.
    body_el = driver.find_element(By.CSS_SELECTOR, "body")
    html_obj = HTML(html=body_el.get_attribute("innerHTML"))
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [None]:
def perform_scrape(cleaned_items=[]):
    """Scrape title and price for every item and return the collected rows.

    Args:
        cleaned_items: Iterable of dicts with "url" and "product_id" keys.
            The default list is never mutated.

    Returns:
        One ``{"url", "product_id", "title", "price"}`` dict per input item,
        in input order. ``title`` and ``price`` stay ``None`` for an item
        whose page could not be scraped.
    """
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            title, price = scrape_product_page(link)
        except Exception as exc:
            # Best effort: one bad page must not abort the whole run, but the
            # failure is reported. `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupts working during a long scrape.
            print(f"Failed to scrape {link}: {exc!r}")
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        })
    return data_extracted

In [None]:
#extracted_data = perform_scrape(cleaned_items=cleaned_links)

In [None]:
#print(extracted_data)

In [None]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one row's product page and record the result on the row.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Mapping-like row with a "url" entry and, optionally, a "scraped"
            entry.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same ``row`` object. A row whose "scraped" value is ``1`` or
        ``"1"`` is returned untouched; otherwise "title", "price" and
        "scraped" are set on it.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # No "scraped" entry yet: treat the row as not scraped.
        scraped = 0
    if scraped == 1 or scraped == "1":
        print("skipped")
        return row
    title, price = (None, None)
    try:
        title, price = scrape_product_page(link)
    except Exception as exc:
        # Best effort: keep processing the remaining rows but report the
        # failure. `Exception` (not a bare except) keeps interrupts working.
        print(f"Failed to scrape {link}: {exc!r}")
    row['title'] = title
    row['price'] = price
    # The row is flagged even when the scrape failed, so it is not retried on
    # a later pass.
    row['scraped'] = 1
    #row['timestamp'] = datetime.datetime.now().timestamp()
    print(link, title, price)
    return row

In [None]:
# Reload the product links written to `product_category_links_output` and
# preview the first rows.
df = pd.read_csv(product_category_links_output)
df.head()

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/Click-Play-Phthalate-Cr...,B00PYLU3GG,"{'name': 'toys-and-games', 'url': 'https://www..."
1,https://www.amazon.com/Cards-Against-Humanity-...,B004S8F7QM,"{'name': 'toys-and-games', 'url': 'https://www..."
2,https://www.amazon.com/Kids-Against-Maturity-O...,B076PRWVFG,"{'name': 'toys-and-games', 'url': 'https://www..."
3,https://www.amazon.com/Mattel-Games-Official-A...,B07P6MZPK3,"{'name': 'toys-and-games', 'url': 'https://www..."
4,https://www.amazon.com/Flybar-Jumper-Toddlers-...,B00WTDXSDM,"{'name': 'toys-and-games', 'url': 'https://www..."


In [None]:
df.shape

(203, 3)

In [None]:
# Work on a copy so `df` itself is not modified by the scrape below.
# (The trailing comment presumably marks a smaller subset for trial runs.)
df_sub = df.copy() # df.head(n=10)

In [None]:
# Scrape row by row: `row_scrape_event` loads each row's URL in the browser
# and fills in title/price/scraped; rows already flagged as scraped are skipped.
df_sub = df_sub.apply(row_scrape_event, axis=1)

https://www.amazon.com/Click-Play-Phthalate-Crush-Plastic/dp/B00PYLU3GG/ Click N' Play Pack of 200 Phthalate Free BPA Free Crush Proof Plastic Ball, Pit Balls - 6 Bright Colors in Reusable and Durable Storage Mesh Bag with Zipper $27.79
https://www.amazon.com/Cards-Against-Humanity-LLC-CAHUS/dp/B004S8F7QM/ Cards Against Humanity $25.00
https://www.amazon.com/Kids-Against-Maturity-Original-Humanity/dp/B076PRWVFG/ Kids Against Maturity: Card Game for Kids and Families, Super Fun Hilarious for Family Party Game Night $29.99
https://www.amazon.com/Mattel-Games-Official-Amazon-Exclusive/dp/B07P6MZPK3/ Mattel Games UNO $9.99
https://www.amazon.com/Flybar-Jumper-Toddlers-Durable-Supports/dp/B00WTDXSDM/ Flybar My First Foam Pogo Jumper for Kids Fun and Safe Pogo Stick for Toddlers, Durable Foam and Bungee Jumper for Ages 3 and up, Supports up to 250lbs $16.99
https://www.amazon.com/Fisher-Price-Rattle-Rock-Maracas-Orange/dp/B071FLSWBS/ Fisher-Price Rattle 'n Rock Maracas, Blue/Orange [Amazon E

In [None]:
# NOTE(review): this writes `df` (the link table as loaded, without
# title/price), not the scraped `df_sub` -- confirm that seeding the products
# file with unscraped rows is intended.
df.to_csv(product_output, index=False)

In [None]:
# Read back the rows currently stored in the products file.
products_df = pd.read_csv(product_output)

In [None]:
# Merge the rows read from disk with the freshly scraped rows. Both frames
# describe the same products, so a plain concat would list every product
# twice, with the stale copy (no title/price) first. Rows from `df_sub` come
# last, so keep="last" retains the scraped version of each
# (product_id, category) pair.
final_df = (
    pd.concat([products_df, df_sub], ignore_index=True)
    .drop_duplicates(subset=["product_id", "category"], keep="last")
    .reset_index(drop=True)
)
final_df.to_csv(product_output, index=False)

In [None]:
final_df.head()

Unnamed: 0,url,product_id,category,title,price,scraped
0,https://www.amazon.com/Click-Play-Phthalate-Cr...,B00PYLU3GG,"{'name': 'toys-and-games', 'url': 'https://www...",,,
1,https://www.amazon.com/Cards-Against-Humanity-...,B004S8F7QM,"{'name': 'toys-and-games', 'url': 'https://www...",,,
2,https://www.amazon.com/Kids-Against-Maturity-O...,B076PRWVFG,"{'name': 'toys-and-games', 'url': 'https://www...",,,
3,https://www.amazon.com/Mattel-Games-Official-A...,B07P6MZPK3,"{'name': 'toys-and-games', 'url': 'https://www...",,,
4,https://www.amazon.com/Flybar-Jumper-Toddlers-...,B00WTDXSDM,"{'name': 'toys-and-games', 'url': 'https://www...",,,
