Written by [Gameli Ladzekpo](mailto:gameli.Ladzekpo@gmail.com) (Twitter/IG: @gamladz)

For [AI Core](theaicore.com)

In [1]:
# Start with imports 

import json 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as expected_conditions
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException
from time import sleep, time
import random
import re
import subprocess, os
import urllib.request
import sys
import boto3
from botocore.exceptions import ClientError
from selenium import webdriver 
import time
import requests

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

In [4]:
# Set up AWS functions
s3 = boto3.resource("s3").Bucket("ikea-dataset")
json.load_s3 = lambda f: json.load(s3.Object(key=f).get()["Body"])
json.dump_s3 = lambda obj, f: s3.Object(key=f).put(Body=json.dumps(obj))
client = boto3.client('s3', region_name='us-east-2')


In [5]:
# Updated
max_time = 10

def open_chrome(port=9220, on_mac=True):
    my_env = os.environ.copy()
    if on_mac:
        print('opening chrome')
        subprocess.Popen(['open', '-a', "Google Chrome", '--args', f'--remote-debugging-port={port}', 'http://www.example.com'], env=my_env)
    else:
        subprocess.Popen(f'google-chrome --remote-debugging-port={port} --user-data-dir=data_dir'.split(), env=my_env)
    print('opened chrome')

class Bot():
    def __init__(self, port_no=9220, headless=False, verbose=False):
        print('initialising bot')

        print(headless)
        options = Options()
        if headless:
            options.add_argument("--headless")
            print('running headless')
        else:
            open_chrome(port=port_no)
            options.add_experimental_option(f"debuggerAddress", f"127.0.0.1:{port_no}")	# attach to the same port that you're running chrome on
        options.add_argument("--no-sandbox")	# without this, the chrome webdriver can't start (SECURITY RISK)
        #options.add_argument("--window-size=1920x1080")
        self.driver = webdriver.Chrome('chrome_driver/chromedriver')	
        self.verbose = verbose
    
    def scroll(self, x=0, y=10000):
        self.driver.execute_script(f'window.scrollBy({x}, {y})')

    def click_btn(self, text):
        if self.verbose: print(f'clicking {text} btn')
        element_types = ['button', 'div', 'input', 'a', 'label']
        for element_type in element_types:
            btns = self.driver.find_elements_by_xpath(f'//{element_type}')
            # for btn in btns:
            #     print(btn.text)
            
            # SEARCH BY TEXT
            try:
                btn = [b for b in btns if b.text.lower() == text.lower()][0]
                btn.click()
                return
            except IndexError:
                pass

            # SEARCH BY VALUE ATTRIBUTE IF NOT YET FOUND
            try:
                btn = self.driver.find_elements_by_xpath(f'//{element_type}[@value="{text}"]')[0]
                btn.click()
                return
            except:
                continue

        raise ValueError(f'button containing "{text}" not found')

    def _search(self, query, _type='search', placeholder=None):
        sleep(1)
        s = self.driver.find_elements_by_xpath(f'//input[@type="{_type}"]')
        print(s)
        if placeholder:
            s = [i for i in s if i.get_attribute('placeholder').lower() == placeholder.lower()][0]
        else:
            s = s[0]
        s.send_keys(query) 

    def toggle_verbose(self):
        self.verbose = not self.verbose

    def download_file(self, src_url, local_destination):
        response = requests.get(src_url)
        with open(local_destination, 'wb+') as f:
            f.write(response.content)


In [6]:

if __name__ == '__main__':
    bot = Bot()
    
    data_dict = {

    }

    searches = {
        "tables_ikea":'https://www.ikea.com/gb/en/cat/tables-desks-fu004/?page=26',
        "chair_ikea":'https://www.ikea.com/gb/en/cat/chairs-fu002/?page=34',
        "shelves_ikea":'https://www.ikea.com/gb/en/cat/bookcases-shelving-units-st002/?page=20', 
        "Cabinets_ikea":'https://www.ikea.com/gb/en/cat/cabinets-cupboards-st003/?page=20'
    }
    

    for search in searches:
        bot.driver.get(searches[search])
        data_dict[search] = {}

        time.sleep(40)
 
        # Go into the main grid and find the link for each result
        results = bot.driver.find_elements_by_xpath('//*[@data-testid="plp-product-list"]//a')  
        print (f'found {len(results)}) results for search "{search}" ')


        results = [r.get_attribute('href') for r in results]
        print(results)

        for i, result in enumerate(results):

            # Skip promotional links, adverts or non-product links in grid
            if '/p/' not in result:
                continue 

            bot.driver.get(result)
            url = result
            result = result.split('/')[-2]

            # Set up dict for each products result
            results_dict = {
                "id": [],
                "url":[],
                "source":[],
                "price": [],
                "image":[],
                "description":[]
            }
            
            results_dict['id'] = hash(url)            
            results_dict['url'] = url
            results_dict['source'] = 'IKEA'

            # Product price
            prod_price = bot.driver.find_element_by_xpath('//*[@class="range-revamp-price__integer"]')
            prod_price = prod_price.get_attribute('innerHTML')
            results_dict['price'] = prod_price

            prod_desc = bot.driver.find_element_by_xpath('//*[@class="range-revamp-header-section__description-text"]')
            prod_desc = prod_desc.get_attribute('innerHTML')
            results_dict['description'] = prod_desc


            os.makedirs(f'data/{search}/{result}', exist_ok=True)


            # Cycle through images 
            images = bot.driver.find_elements_by_xpath('//*[@class="range-revamp-media-grid__media-container"]//img') 
            image = images[0]
            img_url = image.get_attribute('src')
            

            # Create filename for each with ID and get the file extension
            filename = f'{result}-x'
            file_ext = img_url.split('.')[-1] 
            file_ext = file_ext[ 0 : 3 ]
            # Write file locally
            filepath = f'data/{search}/{result}/{filename}.{file_ext}'
            try:
                urllib.request.urlretrieve(img_url, filepath)
            except ConnectionResetError:
                print ('Connection Error')
            else:
                img_data = requests.get(img_url).content
                with open(filepath, 'wb') as handler:
                    handler.write(img_data)                         
            finally:
                pass
            # Write to cloud
            with open(filepath, 'rb') as data:
                s3.upload_fileobj(data, filepath)
            results_dict['image'].append(filepath)
            # remove comment to remove file locally
            # os.remove(filepath)
                    
            # Remove duplicate images
            results_dict['image'] = [i for j, i in enumerate(results_dict['image']) if i not in results_dict['image'][:j]] 
            # Wrte dict to local
            data_dict[search][i] = results_dict 
            with open('data_4.json', 'w') as json_file:
                json.dump(data_dict, json_file)

            # Write to AWS 
            json.dump_s3(data_dict, "data_json_4")







        


initialising bot
False
opening chrome
opened chrome
found 977) results for search "shelves_ikea" 
['https://www.ikea.com/gb/en/p/kallax-shelving-unit-white-80275887/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-white-80275887/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-oak-effect-90307492/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-white-stained-oak-effect-00324518/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-grey-wood-effect-40346924/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-white-green-10459903/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-black-brown-20275885/', 'https://www.ikea.com/gb/en/p/kallax-shelving-unit-high-gloss-white-10305741/', 'https://www.ikea.com/gb/en/p/eket-cabinet-with-4-compartments-white-60333954/', 'https://www.ikea.com/gb/en/p/eket-cabinet-with-4-compartments-white-60333954/', 'https://www.ikea.com/gb/en/p/eket-cabinet-with-4-compartments-white-stained-oak-effect-40428854/', 'https://www.ikea.com/gb/en/p

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=89.0.4389.90)


In [60]:
client = boto3.client('s3', region_name='us-east-2')
response = client.get_object_acl(
    Bucket='ikea-dataset',
    Key='data/plant pots/ingefaera-plant-pot-with-saucer-outdoor-terracotta-50258040/ingefaera-plant-pot-with-saucer-outdoor-terracotta-50258040-4.JPG',
)

print(response)

{'ResponseMetadata': {'RequestId': 'GJW2C5J0B81T0S10', 'HostId': 'OJTSfxfBGH4OHrx3p2RsrwDHl/KjQH2CzGkRd9o6u6TpFD9wsEaLFLe6XMVhWyNFg9W3GCCw6QQ=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'OJTSfxfBGH4OHrx3p2RsrwDHl/KjQH2CzGkRd9o6u6TpFD9wsEaLFLe6XMVhWyNFg9W3GCCw6QQ=', 'x-amz-request-id': 'GJW2C5J0B81T0S10', 'date': 'Fri, 12 Feb 2021 16:10:05 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Owner': {'ID': 'a3990c786892e7897b56425c9b243ba46076eae807a4c01d61c6ba7422e98c2d'}, 'Grants': [{'Grantee': {'ID': 'a3990c786892e7897b56425c9b243ba46076eae807a4c01d61c6ba7422e98c2d', 'Type': 'CanonicalUser'}, 'Permission': 'FULL_CONTROL'}]}
