<h1 style='font-size:40px'>Python Web Automation Project</h1>
<ul style='font-size:20px'> 
    <li>
        Buying goods on the internet may be a very frustating experience. It may be difficult to find the product we aim to acquire with our desired specifications and an adequate price.
    </li>
    <li>
        For such situations, Python can be an splendid tool for automating tasks that need to be done in the web.
    </li>
    <li>
        This project is focused on developing a program that will access Google Shopping and find the offers that match our specific interests.
    </li>
</ul>

In [6]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pandas import DataFrame
class GoogleShoppingQuery():
    
    # This small static method puts the feature names to lowercase.
    @staticmethod
    def lower(features):
        for feature in features:
            features[features.index(feature)] = feature.casefold()
        return features
    
    def __init__(self, product, features, price_range, driver_path):
        # string.
        self.product = product.casefold()
        
        # list.
        self.features = GoogleShoppingQuery.lower(features)
        
        # tuple of numbers.
        self.price_range = price_range
        
        # A string with your Chrome driver's path.
        self.driver_path = driver_path

    
    # This function collects all the result pages from querying the product's name in Google Shopping.
    def __collect_query_results(self):
        
        # The 'output_pages' list will hold the link for all pages returned by the search bar query.
        output_pages = []

        # Making a query on Google Shopping for each product in the 'products' DF.
        driver = webdriver.Chrome(self.driver_path)
        driver.get('https://shopping.google.com.br/')

        # Now, waiting for the website's search bar to appear.
        try:    
            search_bar = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "r7gAOb"))
                                )
            # When the search bar is found, the program will write the product's name in the field and hit the RETURN key.
            search_bar.send_keys(self.product)
            search_bar.send_keys(Keys.RETURN)

        # The page catalogue is stored as an HTML table.
            try:
                pages = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "table"))
                )

                # Each page link the table stores can be bound in its <td> tag.
                page_link = pages.find_elements_by_tag_name('td') 

                # Retrieving the links.
                for page in page_link:
                    a = page.find_elements_by_tag_name('a') 

                    # Some of the <td>'s do not have any <a> tag at all, so we are not going to append them into 
                    # the 'output_pages'. list.
                    if a != []:
                        output_pages.append(a[0].get_attribute('href'))
            except:
                driver.quit()
        except:
            driver.quit()
            
        # The 'output_pages' is returned as the output from the function.
        return output_pages
    
    # Now, this second method is going to be responsible for analyzing the offers from the pages identified by '__collect_query_results'
    # and storing the appropriate one in a pandas DataFrame.
    def analyze_offers(self):
        driver = webdriver.Chrome(self.driver_path)
        min_price, max_price = self.price_range
        
        # The DataFrame in which the convenient merchandises are placed.
        target_products = DataFrame({'Product':[],'Price':[],'Website URL':[]})
        output_pages = self.__collect_query_results()
        
        for page in output_pages:
            driver.get(page)
            
            # This whole 'try ... except' section is focused in looking upon the pages' highlighted offers.
            # This was necessary because they are stored in a <div> with a different class name from the rest of the merchandises.
            try:
                sponsored_offers = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "KZmu8e"))
                            )
                sponsored_offers = driver.find_elements_by_class_name('KZmu8e')

                # Finding the offers and getting their title.
                for offer in sponsored_offers:
                    offer_title = offer.find_element_by_class_name('sh-np__product-title').text.casefold()
                    
                    # Checking if the merchandise being offered has all the features we are looking for.
                    if self.product in offer_title:
                        # If a given desired attribute is not found in the offer's title, the
                        # 'no_attribute_count' will be unequal to 0. This will preclude the merhandise to be
                        # attached to the 'target_products' DF.
                        no_attribute_count = 0
                        for attribute in self.features:
                            if attribute not in offer_title:
                                no_attribute_count +=1

                        # If all the characteristics are found, we'll perform a second verification on the 
                        # product, now seeing if its price is between the minimum and maximum values we've set.
                        if no_attribute_count <1:
                            
                            # Doing this tiny 'try...except' because, contrary to our expectations, some of the offers
                            # do not include their prices!
                            try:
                                price = float(offer.find_element_by_tag_name('b').text.split('R$ ')[1].replace('.','').replace(',','.'))
                            except:
                                continue
                                
                            # Being the price in the range we've defined, the program gets the offer's link.
                            if price in range(min_price, max_price+1):
                                link = offer.find_element_by_class_name('shntl').get_attribute('href')
                                
                                # The product name along its respective price and offer's URL are stored in the 'target_products' DF.
                                target_products.loc[len(target_products)] = self.product, price, link
            finally:
                
                # Now, this part of the function will analyze exclusively the pages' ordinay offers.
                # Essentially, the same operations are performed as with the highlighted merchandises.
                ordinary_offers = driver.find_elements_by_class_name('sh-dgr__content')
                for offer in ordinary_offers:
                    offer_title = offer.find_element_by_class_name('Xjkr3b').text.casefold()
                    if self.product in offer_title:
                        no_attribute_count = 0
                        for attribute in self.features:
                            if attribute not in offer_title:
                                no_attribute_count +=1

                        if no_attribute_count <1:
                            try: 
                                price = float(offer.find_element_by_class_name('a8Pemb').text.split('R$ ')[1].replace('.','').replace(',','.')) #float(offer.find_element_by_class_name('a8Pemb').split('R$ ')[1].replace('.','').replace(',','.'))
                            except:
                                continue
                            if price in range(min_price, max_price +1):
                                link = offer.find_element_by_tag_name('a').get_attribute('href')
                                target_products.loc[len(target_products)] = self.product, price, link       
        return target_products
        

In [613]:
iphone12 = GoogleShoppingQuery('iphone 12', ['64gb'], (3500,4000), PATH).analyze_offers()
iphone12

  driver = webdriver.Chrome(self.driver_path)
  driver = webdriver.Chrome(self.driver_path)
  sponsored_offers = driver.find_elements_by_class_name('KZmu8e')
  ordinary_offers = driver.find_elements_by_class_name('sh-dgr__content')


Unnamed: 0,Product,Price,Website URL
0,iphone 12,3850.0,https://www.google.com.br/url?url=https://www....


In [8]:
import os
PATH = os.environ#.get('CHROMEDRIVER')
PATH
#PATH = '/Users/felipeveiga/Documents/Jupyter USP/Chrome Driver/chromedriver'

environ{'TERM_PROGRAM': 'Apple_Terminal',
        'SHELL': '/bin/bash',
        'TERM': 'xterm-color',
        'TMPDIR': '/var/folders/x3/9ms4m5jd6t378j0s_mcc5j7h0000gq/T/',
        'CONDA_SHLVL': '1',
        'CONDA_PROMPT_MODIFIER': '(base) ',
        'TERM_PROGRAM_VERSION': '433',
        'OLDPWD': '/Users/felipeveiga',
        'MOT_24': 'weareliverpool2410',
        'TERM_SESSION_ID': '83D9EFF3-CDA1-4D09-A187-594B068E064D',
        'EMAIL_24': 'felipeveiga2410@gmail.com',
        'USER': 'felipeveiga',
        'CONDA_EXE': '/Users/felipeveiga/opt/anaconda3/bin/conda',
        'KINDLE_2': 'roberto_veiga_f0e86f@kindle.com',
        'EMAIL_USER': 'felipesveiga@gmail.com',
        'MOT_SP': 'Dudu0603!',
        'SSH_AUTH_SOCK': '/private/tmp/com.apple.launchd.fc3PCwya3I/Listeners',
        'KINDLE_1': 'roberto_veiga_21162a@kindle.com',
        '_CE_CONDA': '',
        'CHROMEDRIVER': '/Users/felipeveiga/Documents/Jupyter USP/Chrome Driver',
        'PATH': '/Users/felipeveiga/opt/anaco