# Amazon Web Scraper

### Requirements

- Beautiful Soup
- Requests
- Selenium (with Firefox and geckodriver)

In [41]:
import csv
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

## Start up the webdriver

In [47]:
# Launch Firefox through the local geckodriver binary, then open a sample
# search-results page to prototype against.
driver = webdriver.Firefox(
    executable_path=r'/Users/michael/Downloads/geckodriver'
)
driver.get('https://www.amazon.com/s?k=tv&ref=nb_sb_noss')

In [48]:
def get_url(search_term):
    '''
    Build the Amazon search-results url for a search term.
    Parameters:
    ---------------------------------------------
    search_term: text to search for; it is url-encoded before being placed
                 in the query string.

    Returns the complete search url as a string.
    '''
    # Imported locally so this cell does not depend on any other cell's imports.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss'
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&' or '#' that would otherwise break the query string.
    return template.format(quote_plus(search_term))

In [49]:
# Quick check: a one-word term should land unchanged in the k= parameter.
get_url('tv')

'https://www.amazon.com/s?k=tv&ref=nb_sb_noss'

## Extract the collection

In [51]:
# Parse the page the driver currently has loaded, using the 'html.parser' parser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [54]:
# Each search result is a <div> tagged with data-component-type="s-search-result".
results = soup.find_all(
    'div',
    attrs={'data-component-type': 's-search-result'},
)

## Prototype the record

In [55]:
# Use the first search result as the sample for prototyping the extraction steps.
item = results[0]

In [56]:
# Grab the anchor (<a>) inside the result's <h2> heading.
atag = item.h2.a

In [60]:
# The anchor's text is used as the product description.
description = atag.text

In [59]:
# Prefix the anchor's href with the Amazon domain to get a full url
# (assumes the href is a site-relative path -- TODO confirm).
url = 'https://www.amazon.com' + atag.get('href')

In [61]:
# Two-step lookup: find the 'a-price' span, then read the text of the
# 'a-offscreen' span nested inside it.
price_parent = item.find('span', class_='a-price')
price = price_parent.find('span', class_='a-offscreen').text
print(price)

$1,199.99


In [62]:
# Same lookup as the two-step version, chained into a single expression.
price = (
    item.find('span', class_='a-price')
    .find('span', class_='a-offscreen')
    .text
)
print(price)

$1,199.99


## Generalize the pattern

In [63]:
def extract_record(item):
    '''
    Pull the description, url, and price out of one search result.
    Parameters:
    ---------------------------------------------
    item: a single search-result element from the results page.

    Returns a (description, url, price) tuple. Nothing here guards against
    missing elements, so a result without a price span raises AttributeError.
    '''
    # The heading's anchor carries both the product title and its link.
    link = item.h2.a
    title = link.text
    full_url = 'https://www.amazon.com' + link.get('href')

    # The price text sits in an 'a-offscreen' span inside the 'a-price' span.
    price_box = item.find('span', class_='a-price')
    price_text = price_box.find('span', class_='a-offscreen').text

    return (title, full_url, price_text)

In [64]:
# Run the prototype extractor over every search result on the page.
# NOTE: extract_record does no error handling at this stage, so this loop
# stops with AttributeError at the first result that has no price span.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
for item in results:
    records.append(extract_record(item))
    

AttributeError: 'NoneType' object has no attribute 'find'

## Error Handling

In [66]:
def extract_record(item):
    '''
    Extract and return data from a single record.
    Parameters:
    ---------------------------------------------
    item: one search result from the search term.

    Returns a (description, url, price) tuple, or None when the result is
    missing its heading link, the link's href, or its price.
    '''

    #description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        #no heading link, so there is nothing to describe or link to
        return None
    href = atag.get('href')
    if not href:
        #without an href the url cannot be built
        return None
    description = atag.text
    url = 'https://www.amazon.com' + href

    #Price
    price_parent = item.find('span', class_='a-price')
    if price_parent is None:
        #some results are listed without any price
        return None
    price_tag = price_parent.find('span', class_='a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    #Tuple of description, url, and price
    return (description, url, price)

In [69]:
# Collect a record for every search result, skipping the ones for which
# extract_record returned nothing.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
for item in results:
    record = extract_record(item)
    if not record:
        continue
    records.append(record)

## Putting it all together

In [75]:
import csv
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

def get_url(search_term):
    '''
    Creates the correctly formatted search url for a specific search term.
    Parameters:
    --------------------------------------------
    search_term: string naming the product to search for; it is url-encoded
                 before being placed in the query string.

    Returns the complete search url as a string.
    '''
    # Imported locally so the function carries its own dependency.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss'
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # characters such as '&' or '#' that would otherwise break the query string.
    return template.format(quote_plus(search_term))

def extract_record(item):
    '''
    Extract and return data from a single record.
    Parameters:
    ---------------------------------------------
    item: one search result from the search term.

    Returns a (description, url, price) tuple, or None when the result is
    missing its heading link, the link's href, or its price.
    '''

    #description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        #no heading link, so there is nothing to describe or link to
        return None
    href = atag.get('href')
    if not href:
        #without an href the url cannot be built
        return None
    description = atag.text
    url = 'https://www.amazon.com' + href

    #Price
    price_parent = item.find('span', class_='a-price')
    if price_parent is None:
        #some results are listed without any price
        return None
    price_tag = price_parent.find('span', class_='a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    #Tuple of description, url, and price
    return (description, url, price)

def find_amazon_prices(search_term, driver_path=r'/Users/michael/Downloads/geckodriver'):
    '''
    Searches amazon for a term and returns the description, url, and price of each result.
    Parameters:
    ---------------------------------------------
    search_term: string of product we will be searching the price for on amazon.
    driver_path: path to the geckodriver executable used to start Firefox;
                 defaults to the location this notebook was written against.

    Returns a list of (description, url, price) tuples; results for which
    extract_record returns nothing are left out.
    '''

    #start the driver
    driver = webdriver.Firefox(executable_path=driver_path)

    try:
        #load the results page and parse the html the browser rendered
        driver.get(get_url(search_term))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        #always end the browser session, even if loading or parsing failed
        driver.quit()

    #Get the information from each item in the search
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    records = [record for record in map(extract_record, results) if record]

    #TODO: save data to something, right now a csv
    return records
    

In [76]:
# Run the full scrape for 'tv' and print the resulting list of
# (description, url, price) tuples.
print(find_amazon_prices('tv'))

[('VIZIO 55-inch M-Series - Quantum 4K HDR Smart TV (54.5-inch diag) (M55Q7-H1, 2020) ', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A10174603VLJS3RXB87QQ&url=%2FVIZIO-55-inch-M-54-5-inch-M55Q7-H1%2Fdp%2FB08R7YVXT8%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dtv%26qid%3D1620942726%26sr%3D8-1-spons%26psc%3D1&qualifier=1620942726&id=2226181089429163&widgetName=sp_atf', '$499.99'), ('Toshiba 32LF221U21 32-inch Smart HD 720p TV - Fire TV Edition, Released 2020 ', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A01953861115659ZDUFJQ&url=%2FAll-New-Toshiba-32LF221U21-32-inch-Smart%2Fdp%2FB0872FYTWS%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26keywords%3Dtv%26qid%3D1620942726%26sr%3D8-2-spons%26psc%3D1&qualifier=1620942726&id=2226181089429163&widgetName=sp_atf', '$199.99'), ('TCL 50-inch Class 4-Series 4K UHD Smart Roku LED TV - 50S435, 2021 Model ', 'https://www.amazon.com/TCL-4K-Smart-LED-50S435