In [None]:
# default_exp scrape

# scrape

> This is a collection of routines for dataset-building via web scraping


This was first shared by [akashgshastri on Fast.ai forums](https://forums.fast.ai/t/google-image-scraper/79682/6), which I've updated for Colab, 

...and added some tools to visualize the dataset after it's scraped.   -- Scott Hawley

## You want to install Kora

[kora](https://pypi.org/project/kora) for Colab gives you Selenium, Bokeh & much more:


In [None]:
#export
#!pip install kora -q
from kora.selenium import wd

## And now to the scraping...

In [None]:
# export

def fetch_image_urls(query:str, max_links_to_fetch:int, wd:wd, sleep_between_interactions:int=1):
    """Collect full-size image URLs from a Google Images search.

    Args:
        query: search phrase; it is URL-encoded before being placed in the search URL.
        max_links_to_fetch: stop as soon as at least this many distinct URLs are collected.
        wd: Selenium-style web driver used to load the page, click thumbnails and run scripts.
        sleep_between_interactions: seconds to pause after each scroll / click so the page can update.

    Returns:
        A set of image URL strings. It can hold fewer than ``max_links_to_fetch`` entries
        when the search runs out of results, and slightly more when one click exposes
        several images at once.
    """
    from urllib.parse import quote_plus   # local import keeps this cell self-contained

    # Give up after this many consecutive passes that reveal no new thumbnails,
    # otherwise an exhausted result page would keep the loop spinning forever.
    max_stalled_passes = 2

    def scroll_to_end():
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    stalled_passes = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end()

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"\n{query}: Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls (read the attribute once per element)
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"{query}: Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every new thumbnail was visited and we are still short of the target.
            print(f"{query}: Found:", len(image_urls), "image links, looking for more ...")

            if number_results == results_start:
                stalled_passes += 1
                if stalled_passes >= max_stalled_passes:
                    break   # nothing new is appearing: return what we have
            else:
                stalled_passes = 0

            # The plural finder returns [] instead of raising when the button is absent.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls


def download_and_save(folder_path:str, url:str, verbose:bool=True):
    """Download one image and store it in ``folder_path`` as an RGB JPEG.

    The file name is the first 10 hex digits of the SHA-1 of the downloaded bytes
    plus ``.jpg``, so identical downloads map to the same file.

    Args:
        folder_path: existing directory to write the image into.
        url: address of the image to download.
        verbose: when True, print a SUCCESS line for a saved image.

    Returns:
        True if the image was downloaded, decoded and written; False otherwise.
        Failures are reported with an ERROR line rather than raised.
    """
    try:
        # A timeout keeps one unresponsive host from hanging the whole scrape.
        image_content = requests.get(url, timeout=30).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing to save: stop here instead of failing again on the unset payload.
        return False

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return False

    if verbose:  print(f"SUCCESS - saved {url} - as {file_path}")
    return True
    

def search_and_download(search_term:str, target_path:str='./images', number_images:int=10, verbose:bool=True):
    """Scrape image URLs for one search term and save the images to disk.

    Images go into ``target_path/<term>`` where ``<term>`` is the lower-cased search
    term with spaces replaced by underscores. The module-level web driver ``wd`` is
    used for the search.

    Args:
        search_term: phrase to search for.
        target_path: parent directory; the per-term folder is created if needed.
        number_images: number of image links to request from the search.
        verbose: forwarded to download_and_save to control per-image SUCCESS lines.

    Returns:
        ``(count, urls)``: how many images were saved and the URLs they came from.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(target_folder, exist_ok=True)

    try_urls = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
    if not try_urls:
        # The search can come back empty (or with None); treat that as "no candidates".
        try_urls = []

    # keep only the urls whose images were successfully saved
    urls = [url for url in try_urls if download_and_save(target_folder, url, verbose=verbose)]
    count = len(urls)

    print(f"{search_term}: Expected {number_images}, succeeded at saving {count}.")
    return count, urls


# work in progress
class Category():
    """Container for one search term's results: loaded images and their source URLs."""

    def __init__(self):
        # Both lists start empty and are filled in by later cells.
        self.urls = []
        self.images = []

    def __len__(self):
        # The two lists are expected to stay the same length; the assert enforces that.
        size = len(self.images)
        assert size==len(self.urls)
        return size

In [None]:
#export
import os 
import time
import requests
import io
from PIL import Image, ImageOps
import hashlib
import shutil
import glob


def img_scrape(search_terms:list, target_path:str='./.images', number_images:int=10, verbose:bool=True):
    """Scrape and download images for several search terms.

    WARNING: every non-hidden entry directly inside ``target_path`` (files and
    folders) is deleted first, so each run starts from an empty directory.

    Args:
        search_terms: phrases to search for; each becomes one key of the result.
        target_path: directory that receives one sub-folder per term.
        number_images: number of image links to request per term.
        verbose: forwarded to search_and_download to control per-image output.

    Returns:
        A dict mapping each search term to a Category whose ``urls`` list holds the
        URLs that were saved successfully. ``images`` is left empty here.
    """
    # clear out directory before use
    for entry in glob.glob(os.path.join(target_path, "*")):
        if os.path.isdir(entry) and not os.path.islink(entry):
            # earlier runs leave one folder per term; os.remove cannot delete those
            shutil.rmtree(entry)
        else:
            os.remove(entry)

    dataset = {key: Category() for key in search_terms}

    for term in search_terms:
        count, urls = search_and_download(search_term=term, target_path=target_path,
                                          number_images=number_images, verbose=verbose)
        dataset[term].urls = urls   # save urls in case we want them later

    return dataset

In [None]:
# And now we actually do the scraping

'''
### Variables to adjust

1. set search_term to an array of strings for which you want images
2. set number_images to the number of images you want for each class
3. set target_path to the path where you want images dataset created.
'''
# NOTE: `search_terms` is not only passed to img_scrape; browse_images, get_pca_xy and
# plot_hover further down also read it as a global. Re-run those cells after changing it.
search_terms = ["dog", "cat", "horse"]                     # the usual
#search_terms = ["les paul guitar", "stratocaster guitar"] # H/T Nathan Sepulveda
#search_terms = ["alligator", "crocodile"]                 # tricky
#search_terms = ["blue sky", "stop sign"]                  # easy: these separate by color!
#search_terms = ["smart person", "stupid person"]          # this is going to be a bad idea! (ethics)

number_images = 10                   # image links requested per search term
target_path = './.images'            # where to save to (img_scrape clears this directory first)

In [None]:
# grab everything!
# img_scrape clears out target_path, scrapes each term, and returns a dict that maps
# every search term to a Category holding the URLs of the images that were saved.
dataset = img_scrape(search_terms, target_path=target_path, number_images=number_images)


dog: Found: 100 search results. Extracting links from 0:100
dog: Found: 10 image links, done!
SUCCESS - saved https://images.theconversation.com/files/319375/original/file-20200309-118956-1cqvm6j.jpg?ixlib=rb-1.1.0&q=45&auto=format&w=1200&h=900.0&fit=crop - as ./.images/dog/3a9e97b232.jpg
SUCCESS - saved https://www.sciencemag.org/sites/default/files/styles/inline__450w__no_aspect/public/dogs_1280p_0.jpg?itok=4t_1_fSJ - as ./.images/dog/56f12be619.jpg
SUCCESS - saved https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/article_thumbnails/other/dog_cool_summer_slideshow/1800x1200_dog_cool_summer_other.jpg - as ./.images/dog/37f0960b6e.jpg
SUCCESS - saved https://i.natgeofe.com/n/4f5aaece-3300-41a4-b2a8-ed2708a0a27c/domestic-dog_thumb.jpg - as ./.images/dog/4df813f7f1.jpg
SUCCESS - saved https://i.ytimg.com/vi/MPV2METPeJU/maxresdefault.jpg - as ./.images/dog/b61b601eea.jpg
SUCCESS - saved https://i.guim.co.uk/img/media/fe1e34da640c5c56ed16f76ce6f994fa9343d09d/0_174_3408_2

Let's see what's been saved to disk:

In [None]:
# Shell escape: list what is inside each entry under target_path ({target_path} is interpolated by IPython).
!ls {target_path}/*

./.images/cat:
00a6e9ddc2.jpg	72c56919f9.jpg	8a43684728.jpg	e0a61ca868.jpg	e98fd0f834.jpg
0bae603340.jpg	77554c65d8.jpg	a5ffbb2606.jpg	e495c20532.jpg

./.images/dog:
1bd016f884.jpg	3a9e97b232.jpg	4df813f7f1.jpg	56f12be619.jpg	e886dbf2be.jpg
37f0960b6e.jpg	3fb7d7c4ef.jpg	4fdbeb98ad.jpg	b61b601eea.jpg	eafde83cd9.jpg

./.images/horse:
1e01a5a766.jpg	442ed9a65e.jpg	99a2fe09a1.jpg	eee46f7aa3.jpg	fe8773fedb.jpg
22e31d3c18.jpg	616a7424e4.jpg	c2f404477d.jpg	f671ed9f0e.jpg


Now we should probably inspect the data to see if it looks good or if we accidentally grabbed images we don't want. 

### Extra: Interactive Image Browser (Slider)

In [None]:
%matplotlib inline

import numpy as np 

# Load images from disk
for term in search_terms:
    # Build the folder name exactly the way search_and_download does (lower-case,
    # spaces to underscores); otherwise a term such as "Les Paul" would look in the
    # wrong folder on a case-sensitive file system.
    term_dir = os.path.join(target_path, '_'.join(term.lower().split(' ')))
    image_paths = [path
                   for ext in ["jpg","gif","png","tga"]
                   for path in glob.glob(os.path.join(term_dir, f'*.{ext}'))]
    dataset[term].images = [Image.open(path) for path in image_paths]
    print(f'Loaded {len(dataset[term].images)} images for {term}')


Loaded 10 images for dog
Loaded 9 images for cat
Loaded 9 images for horse


In [None]:
#export
import matplotlib.pyplot as plt
from ipywidgets import interact
def browse_images(dataset):
    """Show an interactive image browser for a scraped dataset.

    A drop-down selects the class and a slider selects which of that class's
    loaded images to display.

    Args:
        dataset: mapping from class name to an object with an ``images`` list.
            The drop-down options are the keys of this mapping.
    """
    print("Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.")
    # Take the options from the argument itself rather than from a notebook global,
    # so the browser works for any dataset passed in.
    @interact(term=list(dataset.keys()))
    def _browse_images(term):
        images = dataset[term].images
        if not images:
            # An empty class would otherwise ask for the invalid slider range (0, -1).
            print(f"No images loaded for {term}")
            return
        def view_image(i):
            plt.imshow(images[i], cmap=plt.cm.gray_r, interpolation='nearest')
            plt.show()
        interact(view_image, i=(0, len(images)-1))

In [None]:
# Open the class drop-down + image slider for the dataset scraped above.
browse_images(dataset)

Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.


interactive(children=(Dropdown(description='term', options=('dog', 'cat', 'horse'), value='dog'), Output()), _…

### Extra: Hover over points and see images
Using [kora](https://pypi.org/project/kora/) & [Bokeh](https://docs.bokeh.org/en/latest/docs/user_guide/tools.html#custom-tooltip).

TODO: Issue is that we have to load images *again* from the *original image URLs* in order for this to work. ...this is gonna be amazingly slow, we should use thumbnails somehow.

In [None]:
#export
from kora.bokeh import figure
from bokeh.plotting import ColumnDataSource, output_file, show
from numpy import linalg as LA

# We'll use PCA to get x & y coordinates at which to plot data points
#  For that, we need to crop/resize the images to make a uniform array shape
#  Note, for now we're not cropping, so the resize skews the images!
def get_pca_xy():
    """Compute 2-D PCA coordinates for every loaded image.

    Reads the notebook globals ``search_terms`` and ``dataset``: the images of each
    term (in ``search_terms`` order) are resized to 18x18, flattened, and projected
    onto the two eigenvectors of their covariance matrix with the largest eigenvalues.
    The sign of each axis is arbitrary, as always with PCA.

    Returns:
        ``(xs, ys)``: two float32 arrays with one entry per image, all classes
        concatenated in ``search_terms`` order.

    Raises:
        ValueError: if no images have been loaded.
    """

    def sorted_eig(A):  # returns sorted eigenvalues (& their corresponding eigenvectors) of A
        # A covariance matrix is symmetric, so use eigh: it guarantees real output,
        # whereas the general eig routine can return spurious complex values.
        lambdas, vecs = LA.eigh(A)
        order = np.argsort(lambdas)[::-1]     # decreasing eigenvalues
        return lambdas[order], vecs[:, order]

    # In the next line, try putting "ImageOps.grayscale( )" around the "img" in front of the ".resize" ;-)
    # Image.LANCZOS is the filter formerly also named ANTIALIAS (that alias is gone in Pillow 10).
    # NOTE(review): all images are assumed to share one mode (e.g. RGB) so the rows have equal length.
    imlist = [np.asarray( img.resize((18, 18), Image.LANCZOS)).flatten() # resize to tiny, then flatten to 1D
        for term in search_terms for img in dataset[term].images]
    if not imlist:
        raise ValueError("No images loaded: fill dataset[term].images before calling get_pca_xy()")
    images_array = np.array(imlist)
    print("images_array.shape = ",images_array.shape)

    im_cov = np.cov(images_array.T)   # get covariance matrix
    lambdas, vecs = sorted_eig(np.array(im_cov))  # get the eigenvectors
    W = vecs[:,0:2]                      # Grab the 2 most significant dimensions
    proj_img = np.array(images_array @ W, dtype=np.float32)  # Last step of PCA: projection

    xs=proj_img[:,0]  # this is for all classes together as one long array
    ys=proj_img[:,1]
    return xs, ys


def plot_hover(dataset):
    """Build a scatter plot of the images' PCA coordinates with image tooltips.

    Each search term (read from the notebook global ``search_terms``) is drawn as one
    group of dots; hovering over a dot shows the image fetched from its original URL.
    The plot is also directed to the file "toolbar.html".

    Args:
        dataset: mapping from search term to an object with ``images`` and ``urls`` lists.

    Returns:
        The figure object; it is returned rather than shown.
    """
    xs, ys = get_pca_xy()

    output_file("toolbar.html")

    colors = ["blue","red","green"]

    TOOLTIPS = """
        <div>
            <div>
                <img
                    src="@imgs" height="50" alt="@imgs" 
                    style="float: left; margin: 0px 15px 15px 0px;"
                    border="2"
                ></img>
            </div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">@desc</span>
                <span style="font-size: 15px; color: #966;">[$index]</span>
            </div>
            <!---commenting-out coordinate values <div>
                <span style="font-size: 10px; color: #696;">($x, $y)</span>
            </div> -->
        </div>
    """

    p = figure(400,400,  tooltips=TOOLTIPS, title="Mouse over the dots")

    start = 0    # where to read from x & y arrays
    for i, term in enumerate(search_terms):

        urls = dataset[term].urls  # TODO: stop using full image urls, better to use thumbnails

        # get_pca_xy emits one point per loaded *image*, so the offset into xs/ys must
        # advance by the image count; the URL count can differ from it.
        n_points = len(dataset[term].images)
        n = min(len(urls), n_points)   # every data column must have the same length

        # NOTE(review): the i-th URL is assumed to belong to the i-th loaded image;
        # nothing visible here enforces that ordering -- confirm.
        x, y  = xs[start:start+n], ys[start:start+n]

        source = ColumnDataSource( data=dict(x=x, y=y, desc=[term]*n,
            imgs=urls[:n]) )

        color = colors[i % len(colors)]   # cycle the palette when there are more than 3 terms
        p.circle('x', 'y', size=12, line_color=color, fill_color=color, source=source)
        start += n_points

    #show(p)
    return p
    


In [None]:
# plot_hover returns the figure instead of calling show(); as the last expression of the
# cell, that returned figure is what the notebook gets to display.
plot_hover(dataset)

images_array.shape =  (28, 972)


