# **LAB: selenium**

## **Google image scraping**

Dong Gyun Ko <br/>
Last updated: August 15, 2022 <br/>

## **1. install selenium and webdriver**

In [None]:
# Install the Selenium Python package into the notebook environment.
!pip install Selenium
# Refresh the apt package index, then install the Chromium driver package.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin.
# NOTE(review): presumably so the driver can be found by name on PATH — confirm on the target image.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
import sys
# Prepend the chromedriver location to sys.path, exactly as the original cell does.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver

# Build the Chrome options used later when the WebDriver is created.
# The three flags are the usual set for running Chrome without a display
# inside a container-like environment.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [None]:
import os
import time
import socket

from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException
from PIL import Image

## **2. scraping**

### **2.1. scroll_down(): function that scrolls down the page**

In [None]:
def scroll_down(driver=None, pause=1):
    """Scroll the results page to the bottom until it stops growing.

    The page is scrolled repeatedly. The first time the document height
    stops changing, the "more" button located by the XPath below is clicked
    once (presumably Google's "show more results" input). Scrolling then
    continues until the height stops changing a second time. The loop also
    ends if that button cannot be found or is not displayed.

    Args:
        driver: WebDriver to operate on. Defaults to the notebook-level
            ``wd`` so existing ``scroll_down()`` calls keep working.
        pause: Seconds to sleep after each scroll before the height is
            measured again, giving lazily loaded content time to appear.

    Returns:
        None.
    """
    if driver is None:
        driver = wd  # notebook-level WebDriver created in the execution cell

    scroll_count = 0
    print('[scroll_down(): start scroll down]')

    last_height = driver.execute_script('return document.body.scrollHeight')
    after_click = False

    while True:

        print(f'[scroll down: {scroll_count}]')
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        scroll_count += 1
        time.sleep(pause)

        new_height = driver.execute_script('return document.body.scrollHeight')

        if last_height == new_height:
            if after_click:
                # Height stalled again after the button was used: nothing more to load.
                break
            try:
                more_button = driver.find_element('xpath', '//*[@id="islmp"]/div/div/div/div[1]/div[2]/div[2]/input')
                if not more_button.is_displayed():
                    # A hidden button cannot load more results; stop instead of looping forever.
                    break
                more_button.click()
                after_click = True
            except NoSuchElementException as e:
                print(e)
                break

        # Must be updated on every pass: comparing against a stale baseline
        # would make the "height stopped changing" test never succeed.
        last_height = new_height

### **2.2. click_and_save(): select the thumbnail image and save the raw image**

In [None]:
def click_and_save(dir_name, index, img, img_list_length):
    """Click one thumbnail and download the image it opens.

    The thumbnail is clicked, the image element at the XPath below is
    located, and its ``src`` is downloaded into ``dir_name``. Files are named
    ``<scraped_count + 1>.png`` or ``.jpg``, and the notebook-level
    ``scraped_count`` is incremented after each successful download.

    Args:
        dir_name: Directory that receives the downloaded file.
        index: Zero-based position of ``img`` in the thumbnail list; only
            used for the progress message.
        img: Thumbnail element to click.
        img_list_length: Total number of thumbnails; only used for the
            progress message.

    Raises:
        Any exception other than ``HTTPError`` and
        ``ElementClickInterceptedException`` (both are printed and swallowed
        here) propagates to the caller.
    """
    global scraped_count
    from urllib.parse import urlparse  # local import: only this function needs it

    try:
        img.click()
        # Let the following find_element wait up to 3 seconds for the element to appear.
        wd.implicitly_wait(3)
        src = wd.find_element('xpath', '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[3]/div/a/img').get_attribute('src')
        if not src:
            # No source attribute: nothing to download for this thumbnail.
            print(f' {index+1}/{img_list_length} no image source')
            return

        # Inspect only the URL path so query strings ("a.png?w=800") and
        # upper-case extensions do not defeat the PNG check.
        ext = 'png' if urlparse(src).path.lower().endswith('.png') else 'jpg'
        urlretrieve(src, f'{dir_name}/{scraped_count + 1}.{ext}')
        print(f' {index+1}/{img_list_length} save {ext} image')

        scraped_count += 1

    except (HTTPError, ElementClickInterceptedException) as e:
        # Best effort: report the failure and let the caller move on.
        print(e)

### **2.3. scraping(): function that scrapes Google images**

In [None]:
def scraping(dir_name, query):
    """Open the Google image search for ``query`` and save every thumbnail's image.

    The function loads the search URL in the notebook-level ``wd``, scrolls
    the page with ``scroll_down()``, collects the thumbnail ``img`` elements
    and calls ``click_and_save()`` for each one. It finishes by printing the
    success rate based on the notebook-level ``scraped_count``. The driver is
    always quit at the end, even if an error occurs.

    Args:
        dir_name: Directory that receives the downloaded images.
        query: Search phrase; it is URL-encoded before being placed in the URL.
    """
    global scraped_count
    from urllib.parse import quote_plus  # local import: only this function needs it

    # Encode the query so spaces, '&', '#' and non-ASCII text cannot corrupt the URL.
    url = f'https://www.google.com/search?q={quote_plus(query)}&tbm=isch&tbs=isz:l&rlz=1C1NDCM_koKR833KR833&hl=ko&sa=X&ved=0CAIQpwVqFwoTCLipiei2y_kCFQAAAAAdAAAAABAC&biw=1903&bih=937#imgrc=HD39zA1_6EYFyM'
    print(url)

    try:
        wd.get(url)
        wd.maximize_window()

        scroll_down()

        div = wd.find_element('xpath', '//*[@id="islrg"]/div[1]')
        print(div)

        # find_elements (plural) returns a list; 'css selector' is the
        # locator-strategy string Selenium expects for CSS selectors.
        img_list = div.find_elements('css selector', 'div.bRMDJf.islir > img')
        print(img_list)
        total = len(img_list)

        stop = False
        for index, img in enumerate(img_list):
            # At most two attempts per thumbnail: the second one follows a
            # small scroll that may bring the element into a clickable position.
            for attempt in range(2):
                try:
                    click_and_save(dir_name, index, img, total)
                    break
                except (ElementClickInterceptedException, NoSuchElementException) as e:
                    print(e)
                    if attempt == 0:
                        wd.execute_script('window.scrollTo(0, window.scrollY + 100)')
                        time.sleep(1)
                except (ConnectionResetError, URLError, socket.timeout, socket.gaierror) as e:
                    # Network failure for this image only: skip it.
                    print(e)
                    break
                except ElementNotInteractableException as e:
                    # Same as the original: stop processing further thumbnails.
                    print(e)
                    stop = True
                    break
            if stop:
                break

        try:
            print('[scraping end (success rate: %2.f%%)]' % (scraped_count / total * 100.0))
        except ZeroDivisionError as e:
            # No thumbnails were found.
            print(e)
    finally:
        # Always release the browser, even when an unexpected error escapes.
        wd.quit()

### **2.4. filter_and_remove(): remove low-quality images**

In [None]:
def filter_and_remove(dir_name, query, filter_size):
    """Delete images in ``dir_name`` that are too small or cannot be opened.

    A file is removed when both its width and its height are below
    ``filter_size``, or when opening it as an image raises ``OSError``.
    A summary line comparing the number of removed files with the
    notebook-level ``scraped_count`` is printed at the end.

    Args:
        dir_name: Directory whose files are inspected.
        query: Not used by this function; kept so existing calls keep working.
        filter_size: Pixel threshold applied to both width and height.
    """
    filtered_count = 0

    for index, file_name in enumerate(os.listdir(dir_name)):
        file_path = os.path.join(dir_name, file_name)

        try:
            # The context manager closes the file handle for kept images too,
            # and before any attempt to delete the file.
            with Image.open(file_path) as img:
                keep = img.width >= filter_size or img.height >= filter_size
            readable = True
        except OSError as e:
            print(e)
            keep = False
            readable = False

        if keep:
            continue

        try:
            os.remove(file_path)
        except OSError as e:
            # Deletion failed (for example the entry is a directory): report and move on.
            print(e)
            continue

        if readable:
            print(f'{index} remove image')
        filtered_count += 1

    print(f'[number of removed images: {filtered_count}/{scraped_count}]')

### **2.5. execute scraping**

In [None]:
# Default timeout (seconds) for new sockets, so a stalled download raises
# socket.timeout (handled in scraping()) instead of hanging the notebook.
socket.setdefaulttimeout(30)

scraped_count = 0

path ='./'
query = input('search word: ')

dir_name = path + query
# exist_ok=True lets this cell be re-run for the same search word.
os.makedirs(dir_name, exist_ok=True)
print(f'[{dir_name} generate directory]')

# Create the browser only after the steps above have succeeded, so a failure
# there does not leave a Chrome process running. The driver path is not passed
# positionally: recent Selenium releases take `options` as the first parameter
# and locate chromedriver themselves.
wd = webdriver.Chrome(options=chrome_options)

scraping(dir_name, query)
filter_and_remove(dir_name, query, 400)

In [None]:
# List the current working directory.
!ls

In [None]:
# Remove the directory named 'forest' (rmdir only succeeds on an empty directory).
# NOTE(review): the next cell downloads ./forest/10.jpg — confirm this cell is meant to run before it.
!rmdir forest

In [None]:
# Colab-only helper: send one file from the runtime to the local machine.
from google.colab import files
# The path is hard-coded; it only exists after a run whose search word was 'forest'
# and that saved at least ten images — TODO confirm before running.
files.download('./forest/10.jpg')

## **3. references**

* https://www.youtube.com/watch?v=pQ7dOg9c4NI <br/>