Written by [Gameli Ladzekpo](mailto:gameli.Ladzekpo@gmail.com) (Twitter/IG: @gamladz)

For [AI Core](https://theaicore.com)

In [1]:
# Start with imports 

import json
import os
import random
import re
import subprocess
import urllib.parse
import urllib.request
from time import sleep, time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as expected_conditions
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# open_chrome function
def open_chrome(port=9220, on_mac=True):
    """Start Google Chrome with its remote-debugging port switched on.

    The browser is spawned as a child process that inherits a copy of the
    current environment; this function does not wait for it and returns
    nothing.

    Args:
        port: TCP port passed to Chrome's ``--remote-debugging-port`` flag.
        on_mac: When true, launch through the macOS ``open -a`` command and
            load an example page; otherwise invoke the ``google-chrome``
            binary directly with a ``bots`` user-data directory.
    """
    environment = os.environ.copy()

    if not on_mac:
        # Plain command line, tokenised on whitespace.
        command = f'google-chrome --remote-debugging-port={port} --user-data-dir=bots'.split()
    else:
        debug_flag = f'--remote-debugging-port={port}'
        command = ['open', '-a', "Google Chrome", '--args', debug_flag, 'http://www.example.com']

    subprocess.Popen(command, env=environment)

class Bot():
    """Small convenience wrapper around a Selenium Chrome driver.

    On construction it launches Chrome with remote debugging enabled (via
    ``open_chrome``) and creates a webdriver configured to attach to that
    debugging port. The driver is exposed as ``self.driver``.
    """

    def __init__(self, port_no = 9220, headless = False, verbose = False, on_mac = True):
        """Launch Chrome and attach a webdriver to it.

        Args:
            port_no: Remote-debugging port used both to launch Chrome and to
                attach the driver.
            headless: Adds the ``--headless`` argument to the driver options.
            verbose: When true, helper methods print what they are doing.
            on_mac: Forwarded to ``open_chrome`` to pick the launch command.
        """
        print('initialising bot')

        # Launch the browser on the SAME port the driver will attach to.
        open_chrome(port=port_no, on_mac=on_mac)

        options = Options()
        options.add_argument("--no-sandbox")	# without this, the chrome webdriver can't start (SECURITY RISK)
        options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port_no}")	# attach to the same port that you're running chrome on
        if headless:
            # NOTE(review): when attaching through debuggerAddress, launch flags
            # such as --headless presumably do not affect the already-running
            # browser - confirm against the ChromeDriver documentation.
            options.add_argument("--headless") # headless option allows scraper to run in the background
        #options.add_argument("--window-size=1920x1080")

        # The options must actually be handed to the driver, otherwise the
        # debugger address and headless settings above are silently ignored.
        self.driver = webdriver.Chrome('chrome_driver/chromedriver', options=options)			# create webdriver
        self.verbose = verbose

    def click_btn(self, text):
        """Click the first clickable-looking element labelled ``text``.

        For each of several element types, first look for an element whose
        visible text equals ``text`` (case-insensitive); failing that, look
        for one whose ``value`` attribute equals ``text`` exactly.

        Raises:
            ValueError: if no matching element is found for any element type.
        """
        if self.verbose: print(f'clicking {text} btn')
        element_types = ['button', 'div', 'input', 'a', 'label']
        wanted = text.lower()

        for element_type in element_types:
            candidates = self.driver.find_elements_by_xpath(f'//{element_type}')

            # SEARCH BY TEXT
            by_text = [b for b in candidates if b.text.lower() == wanted]
            if by_text:
                by_text[0].click()
                return

            # SEARCH BY VALUE ATTRIBUTE IF NOT YET FOUND
            by_value = self.driver.find_elements_by_xpath(f'//{element_type}[@value="{text}"]')
            if by_value:
                try:
                    by_value[0].click()
                    return
                except WebDriverException:
                    # Best effort, as before: a failed click on this match
                    # falls through to the next element type.
                    continue

        raise ValueError(f'button containing "{text}" not found')

    def _search(self, query, _type='search', placeholder=None, delay=1):
        """Type ``query`` into an ``<input>`` of the given type.

        Args:
            query: Text sent to the input with ``send_keys``.
            _type: Value of the input's ``type`` attribute to look for.
            placeholder: If given, pick the input whose ``placeholder``
                attribute equals it (case-insensitive); otherwise the first
                input of that type is used.
            delay: Seconds to sleep before looking up the input.

        Raises:
            IndexError: if no matching input exists on the page.
        """
        sleep(delay)
        inputs = self.driver.find_elements_by_xpath(f'//input[@type="{_type}"]')
        if self.verbose: print(f'found {len(inputs)} input(s) of type "{_type}"')

        if placeholder:
            # get_attribute returns None when the attribute is absent.
            inputs = [i for i in inputs
                      if (i.get_attribute('placeholder') or '').lower() == placeholder.lower()]
        if not inputs:
            raise IndexError(f'no input of type "{_type}" found (placeholder={placeholder!r})')
        inputs[0].send_keys(query)

    def toggle_verbose(self):
        """Flip the ``verbose`` flag."""
        self.verbose = not self.verbose

In [7]:
from selenium import webdriver 
import time

# XPath of the "show more" button on the search results page.
SHOW_MORE_XPATH = '//*[@class="show-more__button button button--secondary button--small"]'


def image_extension(img_url, default='jpg'):
    """Return the file extension (without the dot) of the file named in ``img_url``.

    Only the path component of the URL is inspected, so query strings such as
    ``?f=s`` do not end up in the extension. ``default`` is returned when the
    path has no extension.
    """
    ext = os.path.splitext(urllib.parse.urlparse(img_url).path)[1]
    return ext[1:] if len(ext) > 1 else default


def product_id_from_url(product_url):
    """Return the last non-empty path segment of ``product_url``."""
    path = urllib.parse.urlparse(product_url).path
    return path.rstrip('/').split('/')[-1]


def load_all_results(driver, pause_seconds=5, max_clicks=200):
    """Scroll to the bottom and click "show more" until it is gone.

    Stops when the button is no longer found, when clicking it fails, or
    after ``max_clicks`` clicks, so the loop always terminates.

    Returns:
        The number of successful clicks.
    """
    clicks = 0
    while clicks < max_clicks:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause_seconds)
        buttons = driver.find_elements_by_xpath(SHOW_MORE_XPATH)
        if not buttons:
            break
        try:
            buttons[0].click()
        except WebDriverException:
            break
        clicks += 1
    return clicks


def all_inner_html(driver, xpath):
    """Return the ``innerHTML`` of every element matching ``xpath``."""
    return [el.get_attribute('innerHTML') for el in driver.find_elements_by_xpath(xpath)]


def first_inner_html(driver, xpath):
    """Return the ``innerHTML`` of the first match for ``xpath``, or ``None``."""
    found = all_inner_html(driver, xpath)
    return found[0] if found else None


def all_image_urls(driver, xpath):
    """Return the non-empty ``src`` attribute of every element matching ``xpath``."""
    sources = (img.get_attribute('src') for img in driver.find_elements_by_xpath(xpath))
    return [src for src in sources if src]


def download_images(img_urls, target_dir, prefix):
    """Download each URL to ``{target_dir}/{prefix}-{index}.{extension}``.

    The directory is created first if needed. Returns the list of file paths
    written. Download errors are not caught.
    """
    os.makedirs(target_dir, exist_ok=True)
    saved = []
    for idx, img_url in enumerate(img_urls):
        path = f'{target_dir}/{prefix}-{idx}.{image_extension(img_url)}'
        urllib.request.urlretrieve(img_url, path)
        saved.append(path)
    return saved


if __name__ == '__main__':
    # EXAMPLE USAGE
    bot = Bot()
    driver = bot.driver

    searches = ['plant pots','plates', 'bin']
    for search in searches:
        # URL-encode the query so multi-word searches form a valid URL.
        driver.get(f'https://www.ikea.com/gb/en/search/products/?q={urllib.parse.quote_plus(search)}')

        # Expand the result list as far as it goes.
        load_all_results(driver)

        # Go into the main grid and find the link for each result
        result_links = driver.find_elements_by_xpath('//*[@id="search-results"]/div/a')
        print(f'found {len(result_links)} results for search "{search}"')
        # Read the hrefs up front: the elements are no longer usable once
        # the driver navigates to another page.
        product_urls = [link.get_attribute('href') for link in result_links]

        for product_url in product_urls:
            if not product_url:
                continue
            driver.get(product_url)
            product_id = product_id_from_url(product_url)
            product_dir = f'data/{product_id}'

            # Product dimensions: "name: value" pairs from the <dt>/<dd> list
            dims_container = '//*[@class="range-revamp-product-dimensions__list-container"]'
            dim_names = [name.split(":")[0] for name in all_inner_html(driver, f'{dims_container}//dt')]
            dim_measures = all_inner_html(driver, f'{dims_container}//dd')

            product = {
                # Product Description
                'name': first_inner_html(driver, '//*[@class="range-revamp-header-section__title--big"]'),
                'price': first_inner_html(driver, '//*[@class="range-revamp-price__integer"]'),
                'description': first_inner_html(driver, '//*[@class="range-revamp-header-section__description-text"]'),
                'dimensions': dict(zip(dim_names, dim_measures)),
                # Product Details
                'details': " ".join(all_inner_html(
                    driver, '//*[@class="range-revamp-product-details__paragraph"]')),
                # Materials
                'materials': ",".join(all_inner_html(
                    driver, '//*[@id="SEC_product-details-material-and-care"]//span')),
                # Sustainability (joined from its own list, not the materials string)
                'sustainability': ",".join(all_inner_html(
                    driver, '//*[@id="SEC_product-details-sustainability-and-environment"]//span')),
            }
            for field, value in product.items():
                print(f'{field}: {value}')

            # Product dimensions images
            dims_image_urls = all_image_urls(
                driver, '//*[@class="range-revamp-product-dimensions-content__images"]//img')
            # Main product images
            image_urls = all_image_urls(
                driver, '//*[@class="range-revamp-media-grid__media-container"]//img')

            # Save every image once; download_images creates the folder first.
            saved = download_images(dims_image_urls, product_dir, f'dims-{product_id}')
            saved += download_images(image_urls, product_dir, product_id)
            print(f'saved {len(saved)} images to {product_dir}')


 
          
            



        


initialising bot
