In [1]:
#code to have Jupyter Notebook install packages
import sys
!pip install --upgrade pip
!pip install selenium
!pip install webdriver-manager
!pip install pandas
!pip install pyttsx3







In [3]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pyttsx3

In [4]:
#case-insensitive substring test: is 'string' found anywhere inside 'input'?
def contains_string(input, string):
    """Return True if `string` occurs in `input` when letter case is ignored, else False.

    An empty `string` is considered to be contained in any `input`.
    """
    haystack = input.lower()
    needle = string.lower()
    return needle in haystack

In [5]:
#scans 'string' for every entry of 'port_list' and returns the entries that occur in it (case-insensitive)
def find_ports(string, port_list=['HDMI', 'VGA', 'DVI', 'DisplayPort', 'Display Port', 'Thunderbolt', 'Type-C']):
    """Return the items of `port_list` that appear in `string`.

    Matching is delegated to `contains_string`, so it ignores letter case.
    The result keeps the order (and spelling) of `port_list`; it is empty when
    nothing matches.
    """
    return [candidate for candidate in port_list if contains_string(string, candidate)]


In [6]:
#helper: returns the part of 'text' that follows the first occurrence of 'label', or None when 'label' is absent
def _text_after(text, label, ignore_case=False):
    """Return `text` sliced from just past the first `label`, or None if `label` is not found.

    Returning None (instead of slicing from a `find()` result of -1) is what keeps
    callers from extracting arbitrary text when the label is missing.
    """
    if ignore_case:
        start = text.lower().find(label.lower())
    else:
        start = text.find(label)
    if start == -1:
        return None
    return text[start + len(label):]


#helper: returns the text of the first element matching the locator, or '' when there is no such element
def _element_text(parent, by, locator):
    """Return `.text` of the first match below `parent` (a driver or an element), or '' if none exists."""
    try:
        return parent.find_element(by, locator).text
    except NoSuchElementException:
        return ''


#helper: returns the flattened 'technical details' text of the current product page, or '' when it is missing
def _technical_details_text(driver):
    """Read the technical details from the 'product details' section of the loaded page.

    Two layouts are tried in turn inside 'productDetails_feature_div': the newer
    one (class 'a-expander-extend-content') and the older one
    (id 'productDetails_detailBullets_sections1'). Newlines become spaces.
    """
    try:
        product_details = driver.find_element(By.ID, 'productDetails_feature_div')
    except NoSuchElementException:
        return ''
    layouts = (
        (By.CLASS_NAME, 'a-expander-extend-content'),
        (By.ID, 'productDetails_detailBullets_sections1'),
    )
    for by, locator in layouts:
        try:
            return product_details.find_element(by, locator).text.replace('\n', ' ')
        except NoSuchElementException:
            continue
    return ''


#helper: turns one search result element into a feature dict; returns None for results that are skipped
def _scrape_search_result(result):
    """Collect name, url, rating, price and listed specs from one search result.

    Returns None when the result contains an 'a-spacing-micro' element (used by
    this notebook as the marker of a sponsored result) or when it has no
    header link that could be visited later.
    """
    #find_elements returns an empty list instead of raising, so it doubles as an existence check
    if result.find_elements(By.CLASS_NAME, 'a-spacing-micro'):
        return None
    try:
        header = result.find_element(By.TAG_NAME, 'h2')
        url = header.find_element(By.TAG_NAME, 'a').get_attribute('href')
    except NoSuchElementException:
        return None
    if not url:
        return None
    feature_dict = {'Name': header.text, 'Url': url}

    #the 'a-spacing-top-micro' <div> holds rating info; per the original notes it can hold shipping info instead
    #when a product has no ratings, in which case the int() conversion below fails and dummy values are stored
    try:
        rating = result.find_element(By.CLASS_NAME, 'a-spacing-top-micro')
        rating_spans = rating.find_elements(By.TAG_NAME, 'span')
        overall_rating = rating_spans[0].get_attribute('aria-label')
        num_ratings = rating_spans[4].text
        int(num_ratings.replace(',', ''))
    except (NoSuchElementException, IndexError, ValueError):
        overall_rating = num_ratings = 'No Rating'
    feature_dict['Rating'] = overall_rating
    feature_dict['#Ratings'] = num_ratings

    #price is optional; the key is simply left out when no price element exists
    try:
        feature_dict['Price'] = result.find_element(By.CLASS_NAME, 'a-price').text.replace('\n', '.').replace(',', '')
    except NoSuchElementException:
        pass

    #specs listed directly on the search result are optional as well
    try:
        features = result.find_element(By.CLASS_NAME, 's-product-specs-view')
    except NoSuchElementException:
        return feature_dict
    feature_keys = features.find_elements(By.CLASS_NAME, 'a-color-secondary')
    feature_values = features.find_elements(By.CLASS_NAME, 'a-text-bold')
    for k, v in zip(feature_keys, feature_values):
        feature_dict[k.text] = v.text
    return feature_dict


#helper: opens the product's own page and adds the details found there to the 'product' dict (in place)
def _scrape_product_page(driver, product):
    """Visit `product['Url']` and add Resolution, Manufacturer, Ports, Curved, Speakers and Height Adjustable.

    Every page section is treated as optional: a section that is missing
    contributes an empty string instead of aborting the whole scrape.
    """
    driver.get(product['Url'])
    title = _element_text(driver, By.CLASS_NAME, 'product-title-word-break')
    product_overview = _element_text(driver, By.ID, 'productOverview_feature_div').replace('\n', ' ')
    feature_bullets = _element_text(driver, By.ID, 'featurebullets_feature_div').replace('\n', ' ')
    from_manufacturer = _element_text(driver, By.ID, 'aplus_feature_div').replace('\n', ' ')
    technical_details = _technical_details_text(driver)
    description = _element_text(driver, By.ID, 'productDescription_feature_div').replace('\n', ' ')
    #only the second line of the 'what's in the box' section is kept; a section with a single line counts as missing
    try:
        in_the_box = driver.find_element(By.ID, 'whatsInTheBoxDeck').text.split('\n')[1]
    except (NoSuchElementException, IndexError):
        in_the_box = ''
    #all descriptive text of the product in one string, used for the keyword searches below
    super_description = f'Title: {title}, Overview: {product_overview}, Bullets: {feature_bullets}, From Manufacturer: {from_manufacturer}, Techincal: {technical_details}, Description: {description}, Box: {in_the_box}'

    #the resolution from the technical details (e.g. '1920x1080') overrides the coarser value from the search result;
    #when the field is absent the earlier value, if any, is kept as a fallback
    after_resolution = _text_after(technical_details, 'Screen Resolution', ignore_case=True)
    if after_resolution is not None:
        #the text after the label starts with a space, so the pieces of interest are items 1-3 (e.g. ['1920', 'x', '1080'])
        resolution = after_resolution.split(' ')[1:4]
        product['Resolution'] = ''.join(resolution).replace('MaxScreen', '').replace('PixelsOther', '')
    #first word after the 'Manufacturer' label; '' when the label does not occur at all
    after_manufacturer = _text_after(technical_details, 'Manufacturer')
    if after_manufacturer is None:
        product['Manufacturer'] = ''
    else:
        #[1:] skips the single separator character that follows the label
        product['Manufacturer'] = after_manufacturer[1:].split(' ')[0]
    product['Ports'] = ', '.join(find_ports(super_description))
    product['Curved'] = contains_string(super_description, 'curved')
    product['Speakers'] = contains_string(super_description, 'speakers')
    product['Height Adjustable'] = contains_string(super_description, 'height')


#main function of program; drives the scraping of search pages and product pages
def get_product_list(num_pages, driver, base_search):
    """Scrape the first `num_pages` search pages and every non-skipped product on them.

    Parameters:
        num_pages: number of search result pages to visit, starting at page 1.
        driver: Selenium webdriver used for all navigation.
        base_search: search url containing one '{}' placeholder for the page number.

    Returns:
        A list of dictionaries, one per product, in the order the products were found.
    """
    master_list = []
    for page in range(1, num_pages + 1):
        print(f'Starting page: {page}')
        driver.get(base_search.format(page))
        #raw string: the backslash is part of the locator itself, not a Python escape sequence
        results = driver.find_elements(By.CLASS_NAME, r'template\=SEARCH_RESULTS')
        #every result of the page is read before any product page is opened,
        #because the result elements belong to the search page that is about to be left
        page_list = []
        for result in results:
            feature_dict = _scrape_search_result(result)
            if feature_dict is not None:
                page_list.append(feature_dict)
        for product in page_list:
            _scrape_product_page(driver, product)
            master_list.append(product)
    return master_list

In [7]:
#instantiates the webdriver
#the chromedriver path returned by webdriver-manager is handed over through a Service object,
#which is how Selenium 4 expects the driver location (it is not a positional argument of webdriver.Chrome)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/erjonhome/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache
  


In [8]:
#url template for the search; '{}' is replaced by the page number
search_url = 'https://www.amazon.com/s?k=computer+monitor&page={}&crid=2HRQHDDQ8D5R0&qid=1648657135&sprefix=computer+monitor%2Caps%2C192&ref=sr_pg_2'
#our target home url (beginning of search): page 1 of the same search, derived from the template so the two cannot drift apart
home_url = search_url.format(1)

In [9]:
#takes our webdriver to our home url(beginning of search)
driver.get(home_url)
#finds the disabled pagination objects on the search page (example for the first page: the left page arrow and the last page button)
disabled_paginations = driver.find_elements(By.CLASS_NAME, 's-pagination-disabled')
#when more than one disabled pagination came through, the second one is read as the last page button; otherwise the only one is
last_page_index = 1 if len(disabled_paginations) > 1 else 0
try:
    #sets 'page_max' to the last page returned for the search
    page_max = int(disabled_paginations[last_page_index].text)
except (IndexError, ValueError):
    #no disabled pagination at all, or its text is not a number: the page that is open is the only one known to exist
    print('Could not read the last page number from the pagination bar; assuming a single page.')
    page_max = 1

In [10]:
#how many result pages to scrape; example: 'num_pages = 5' scrapes pages 1-5, 'num_pages = page_max' scrapes every available page
num_pages = 15
#a request that goes beyond the last available page is reported and then capped at 'page_max' to avoid errors
if page_max < num_pages:
    print(f'Search only returns {page_max} pages, you requested {num_pages}. Will only return {page_max} pages.')
    num_pages = min(num_pages, page_max)

In [11]:
#gets the full list of products using given variables
try:
    product_list = get_product_list(num_pages, driver, search_url)
finally:
    #shuts the browser down whether or not the scrape succeeded;
    #quit() ends the whole webdriver session, whereas close() only closes the current window
    driver.quit()

Starting page: 1
Starting page: 2
Starting page: 3
Starting page: 4
Starting page: 5
Starting page: 6
Starting page: 7
Starting page: 8
Starting page: 9
Starting page: 10
Starting page: 11
Starting page: 12
Starting page: 13
Starting page: 14
Starting page: 15


In [12]:
#builds a pandas dataframe from the list of product dictionaries (one row per product)
results_df = pd.DataFrame(data=product_list)

In [13]:
#writes the dataframe to 'output.csv' in the current working directory
results_df.to_csv(path_or_buf='output.csv')

In [14]:
#gives voice notification when code is done running
#uses pyttsx3 to speak the sentence 'Your output is ready.' aloud
engine = pyttsx3.init()
engine.say('Your output is ready.')
engine.runAndWait()