In [None]:
# default_exp scrape

In [None]:
#hide 
# Do this on Colab and then restart runtime:
#!pip install fastai --upgrade | grep -v 'already satisfied'
#!pip install mrspuff --upgrade | grep -v 'already satisfied'

# scrape

> This is a collection of routines for dataset-building via web scraping, intended to be run on Google Colab

This is modified from Google-scraping code first shared [by akashgshastri on Fast.ai forums](https://forums.fast.ai/t/google-image-scraper/79682/6) which I then updated.

*Note: It turns out that Jeremy Howard & Sylvain Gugger had already taught scraping in the [2020 version of the FastAI course](https://github.com/fastai/course2020) and provided some useful routines. So I'm going to remove the code I had written previously and replace it with slightly modified versions of theirs. (In particular, their DuckDuckGo scraper doesn't require a Selenium ChromeDriver like the Google-scraping code I'd written, which is great because that was messing up the CI on GitHub anyway.)*



In [None]:
#export
from fastcore.basics import *
import numpy as np 
import re
import requests 
import json 
import os, io  
from PIL import Image, ImageOps
import hashlib
import shutil
import glob
from pathlib import Path
import subprocess
import time 
from IPython.display import HTML
import matplotlib.pyplot as plt
from ipywidgets import interact
from mrspuff.utils import calc_prob, on_colab 
import pandas as pd

In [None]:
#export

#modified from fastbook utils, https://github.com/fastai/course20/blob/master/fastbook/__init__.py
#by Jeremy Howard and Sylvain Gugger.  Just removed the .decode() formatting, and replaced L() with list()
def search_images_ddg(key,max_n=200,max_retries=5):
    """By Howard & Gugger: Search for 'key' with DuckDuckGo and return up to 'max_n' unique image urls
    (Adopted from https://github.com/deepanprabhu/duckduckgo-images-api)

    key:         search phrase
    max_n:       maximum number of unique urls to return
    max_retries: number of consecutive failed result requests tolerated before giving up

    Returns a list of urls in the order DuckDuckGo supplied them, duplicates removed.
    Returns None (after printing a message) if the search token cannot be parsed
    from the landing page. An exception raised by the initial POST is not caught.
    """
    url        = 'https://duckduckgo.com/'
    params     = {'q':key}
    res        = requests.post(url,data=params)
    searchObj  = re.search(r'vqd=([\d-]+)\&',res.text)
    if not searchObj: print('Token Parsing Failed !'); return
    requestUrl = url + 'i.js'
    headers    = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params     = (('l','us-en'),('o','json'),('q',key),('vqd',searchObj.group(1)),('f',',,,'),('p','1'),('v7exp','a'))
    urls, seen = [], set()    # list keeps order, set gives cheap duplicate checks
    failures   = 0            # consecutive failures on the current page
    while len(urls) < max_n:
        try:
            res     = requests.get(requestUrl,headers=headers,params=params)
            data    = json.loads(res.text)
            results = data['results']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            # Retry the same page, but only a bounded number of times: an unbounded
            # retry never terminates when the network is down or the reply is malformed.
            failures += 1
            if failures >= max_retries:
                print(f"Giving up on '{key}' after {failures} failed requests - {e}")
                break
            continue
        failures = 0
        for obj in results:
            image_url = obj.get('image') if isinstance(obj, dict) else None
            if image_url is None or image_url in seen: continue   # skip malformed entries and repeats
            seen.add(image_url)
            urls.append(image_url)
            if len(urls) >= max_n: return urls
        if 'next' not in data: break      # no further result pages
        requestUrl = url + data['next']
    return urls

In [None]:
#export 
def download_and_save(folder_path:str, url:str, verbose:bool=True, timeout:float=30.0):
    """Download the image at `url` and store it in `folder_path` as an RGB JPEG.

    folder_path: existing directory to write into
    url:         address of the image to fetch
    verbose:     accepted for call compatibility; currently nothing extra is printed
    timeout:     seconds passed to `requests.get` so a stalled server cannot hang the scrape

    The file name is the first 10 hex digits of the SHA-1 of the downloaded bytes
    plus '.jpg'. Returns the saved file's path, or '' if the download or the save
    failed (an ERROR line is printed in that case; no exception propagates).
    """
    try:
        image_content = requests.get(url, timeout=timeout).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        return ''   # nothing to decode; stop here instead of failing again in the save step

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return ''
    return file_path
    

def search_and_download(search_term:str, df=None, target_path:str='./images', num_images:int=10, verbose:bool=True):
    """Search DuckDuckGo for `search_term` and download the resulting images.

    search_term: phrase to search for; also stored as the 'label' of each row
    df:          optional DataFrame of earlier results to extend (it is not modified in place)
    target_path: parent directory; images go in a sub-folder named after the
                 lower-cased term with spaces replaced by underscores
    num_images:  number of urls to request from the search
    verbose:     print a per-term summary and forward the flag to `download_and_save`

    Returns a DataFrame with columns 'file_path', 'label', 'orig_url' holding the
    rows of `df` (if any) followed by one row per image saved by this call.
    """
    columns = ['file_path', 'label', 'orig_url']
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))
    os.makedirs(target_folder, exist_ok=True)

    # the search returns None when its token lookup fails; treat that as "no urls"
    try_urls = search_images_ddg(search_term, max_n=num_images) or []
    print(f"...got {len(try_urls)} urls for term '{search_term}'")

    # Collect rows in a plain list and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
    records = []
    for url in try_urls:
        file_path = download_and_save(target_folder, url, verbose=verbose)
        if file_path != '':
            records.append({'file_path': file_path, 'label': search_term, 'orig_url': url})

    # report this term's successes only, not rows carried in through `df`
    if verbose: print(f"{search_term}: Expected {num_images}, succeeded at saving {len(records)}.")

    new_rows = pd.DataFrame(records, columns=columns)
    if df is None or len(df) == 0: return new_rows
    if not records: return df
    return pd.concat([df, new_rows], ignore_index=True)


In [None]:
#export
def img_scrape(search_terms:list, target_path:str='./.images', num_images:int=10, verbose:bool=True):
    """Scrape images for every term in `search_terms` into sub-folders of `target_path`.

    WARNING: every existing sub-directory of `target_path` is deleted first.
    Plain files and symlinks directly inside `target_path` are left alone.

    search_terms: list of search phrases, one image class per phrase
    target_path:  directory that receives one sub-folder per term
    num_images:   number of urls to request per term
    verbose:      print progress; also forwarded to `search_and_download`

    Returns a DataFrame with columns 'file_path', 'label', 'orig_url', one row
    per saved image, indexed 0..n-1.
    """
    # clear out directory before use
    for category_path in glob.glob(os.path.join(target_path, "*")):
        # rmtree raises on plain files and on symlinks, so only real folders are removed
        if os.path.isdir(category_path) and not os.path.islink(category_path):
            shutil.rmtree(category_path)

    frames = []
    for term in search_terms:
        if verbose: print(f"Searching on term '{term}'...")
        frames.append(search_and_download(search_term=term, target_path=target_path,
                                          num_images=num_images, verbose=verbose))

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end instead
    frames = [frame for frame in frames if frame is not None and len(frame) > 0]
    if not frames:
        return pd.DataFrame(columns=['file_path', 'label', 'orig_url'])
    return pd.concat(frames, ignore_index=True)   # file paths and original urls

In [None]:
# ### Variables to adjust
#
# 1. set `search_terms` to a list of strings for which you want images (one class per string)
# 2. set `num_images` to the number of images you want for each class
# 3. set `target_dir` to the path where you want the image dataset created
search_terms = ['cat','dog','horse']
#search_terms = ["les paul guitar", "stratocaster guitar"] # H/T Nathan Sepulveda
#search_terms = ["alligator", "crocodile"]                 # tricky
#search_terms = ["blue sky", "stop sign"]                  # easy: these separate by color!
#search_terms = ["smart person", "stupid person"]          # this is going to be a bad idea! (ethics)

num_images = 10
target_dir = 'scraped_images'            # where to save to

In [None]:
# Scrape `num_images` images per term into `target_dir`.
# NOTE: img_scrape starts by deleting the category folders already present under `target_dir`.
df = img_scrape(search_terms, target_path=target_dir, num_images=num_images, verbose=True)
print(len(df),"images")
# bare expression last so the notebook renders the table of saved files and their source urls
df

Searching on term 'cat'...
...got 10 urls for term 'cat'
cat: Expected 10, succeeded at saving 10.
Searching on term 'dog'...
...got 10 urls for term 'dog'
ERROR - Could not download https://www.hdnicewallpapers.com/Walls/Big/Dog/Three_Cute_Doggy_in_Row.jpg - HTTPSConnectionPool(host='www.hdnicewallpapers.com', port=443): Max retries exceeded with url: /Walls/Big/Dog/Three_Cute_Doggy_in_Row.jpg (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc9a8889f90>: Failed to establish a new connection: [Errno -2] Name or service not known'))
ERROR - Could not save https://www.hdnicewallpapers.com/Walls/Big/Dog/Three_Cute_Doggy_in_Row.jpg - local variable 'image_content' referenced before assignment
dog: Expected 10, succeeded at saving 9.
Searching on term 'horse'...
...got 10 urls for term 'horse'
ERROR - Could not save https://horsespirit.site/wp-content/uploads/2020/04/shir10.jpg - cannot identify image file <_io.BytesIO object at 0x7fc9a8859f50>
ERROR 

Unnamed: 0,file_path,label,orig_url
0,scraped_images/cat/d4df5a839c.jpg,cat,https://upload.wikimedia.org/wikipedia/commons/d/dc/Grumpy_Cat_(14556024763)_(cropped).jpg
1,scraped_images/cat/b3a14c4060.jpg,cat,https://wallup.net/wp-content/uploads/2019/09/708281-kittens-kitten-cat-cats-baby-cute.jpg
2,scraped_images/cat/74da23aa13.jpg,cat,https://upload.wikimedia.org/wikipedia/commons/thumb/6/66/An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg/1200px-An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg
3,scraped_images/cat/7655f2e055.jpg,cat,https://www.sciencenews.org/wp-content/uploads/2020/03/033120_HT_covid-cat_feat-1028x579.jpg
4,scraped_images/cat/671ccc5b51.jpg,cat,https://c.pxhere.com/photos/37/30/cat_oriental_cute_white_animals_siamese_short_haired_kitten-1412250.jpg!d
5,scraped_images/cat/7de9c24e5c.jpg,cat,http://mymodernmet.com/wp/wp-content/uploads/2017/03/gabrielius-khiterer-stray-cats-11.jpg
6,scraped_images/cat/a6a645f3cd.jpg,cat,http://purrtacular.com/wp-content/uploads/2017/09/maya-extra-chromosome-cat-7.jpg
7,scraped_images/cat/d439145b0d.jpg,cat,https://scitechdaily.com/images/Cat-COVID-19-Mask.jpg
8,scraped_images/cat/4e0250916f.jpg,cat,https://www.askideas.com/media/25/American-Shorthair-Cat-Face-Picture.jpg
9,scraped_images/cat/a95061b4ae.jpg,cat,https://htc-wallpaper.com/wp-content/uploads/2015/05/Cute-cat-licking-its-paw.jpg


Let's see what's been saved to disk:

In [None]:
!ls {target_dir}/*

scraped_images/cat:
4e0250916f.jpg	74da23aa13.jpg	7de9c24e5c.jpg	a95061b4ae.jpg	d439145b0d.jpg
671ccc5b51.jpg	7655f2e055.jpg	a6a645f3cd.jpg	b3a14c4060.jpg	d4df5a839c.jpg

scraped_images/dog:
045bb79206.jpg	88bb07cac8.jpg	9b59f312a8.jpg	d264ab59b0.jpg	f8301d8187.jpg
1524e33153.jpg	8cee87adc0.jpg	ba0ce878e0.jpg	dae216cf27.jpg

scraped_images/horse:
21756204b2.jpg	3f1f84d4b9.jpg	6d8546a751.jpg	b821ac7db9.jpg
22e3ddd735.jpg	6851ee1b64.jpg	8adbd9567c.jpg


Now we should probably inspect the data to see if it looks good or if we accidentally grabbed images we don't want. 

### Extra: Interactive Image Browser (Slider)

In [None]:
# work in progress
class Category():
    """Holds the data collected for one search term (work in progress)."""
    def __init__(self):
        self.images = []   # images belonging to this category
        self.urls = []     # reserved for matching urls; not filled in by the code shown here

    def __len__(self):
        """Return the number of images in the category.

        `urls` is deliberately not counted or checked: it is not kept in sync
        with `images` yet.
        """
        return len(self.images)

# Load images from disk
dataset = {key: Category() for key in search_terms}
for term in search_terms:
    # Mirror the folder naming used by search_and_download (lower-cased, spaces to
    # underscores); without the lower() a mixed-case term looks in a folder that was never created.
    folder_name = '_'.join(term.lower().split(' '))
    term_dir = f'{target_dir}/{folder_name}/'    # named term_dir so the builtin dir() is not shadowed
    image_files = [fname for ext in ["jpg","gif","png","tga"] for fname in glob.glob(f'{term_dir}*.{ext}')]
    dataset[term].images = [Image.open(fname) for fname in image_files]
    print(f'Loaded {len(dataset[term].images)} images for {term}')

Loaded 10 images for cat
Loaded 9 images for dog
Loaded 7 images for horse


In [None]:
%matplotlib inline

In [None]:
#export
def browse_images(dataset):
    """Interactive browser: a drop-down picks the class, a slider picks the image.

    dataset: mapping from class name to an object that supports len() and has an
             `images` sequence (e.g. a dict of Category objects)
    """
    print("Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.")
    # Build the drop-down from the dataset that was passed in, not from a notebook
    # global, so the function also works on other datasets and after export.
    @interact(term=list(dataset.keys()))
    def _browse_images(term):
        n = len(dataset[term])
        if n == 0:   # a slider over (0, -1) would be an invalid range
            print(f"No images loaded for '{term}'.")
            return
        def view_image(i):
            plt.imshow(dataset[term].images[i], cmap=plt.cm.gray_r, interpolation='nearest')
            plt.show()
        interact(view_image, i=(0,n-1))

In [None]:
# Open the interactive browser on the images loaded from disk above
browse_images(dataset)

Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.


interactive(children=(Dropdown(description='term', options=('cat', 'dog', 'horse'), value='cat'), Output()), _…

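Let's make a bunch of thumbnails: for every image file in `target_dir`, load the image, shrink it, and save it under the same file name in a parallel `<target_dir>_thumbs` directory with the same sub-folder structure (on Colab, that directory is created inside Google Drive).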

In [None]:
#export 
# Detect Google Colab by whether its `drive` helper can be imported.
# Only ImportError is caught, so Ctrl-C or unrelated failures are not silently swallowed.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

def _drive_file_id(path, timeout:int=60):
    """Read the 'user.drive.id' extended attribute of `path`, polling once per
    second (at most `timeout` times) while it still holds a temporary 'local-' id.
    Returns the id string, or None if the lookup fails or never settles."""
    for _ in range(timeout):
        try:
            # argument list instead of a shell string: no quoting problems with odd file names
            proc = subprocess.run(['xattr', '-p', 'user.drive.id', str(path)],
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                  universal_newlines=True)
        except OSError:          # e.g. the xattr command is not installed
            return None
        fid = proc.stdout.strip()
        if proc.returncode != 0 or not fid:
            return None          # do not mistake an error message for a file id
        if 'local-' not in fid:
            return fid
        time.sleep(1)            # url still isn't ready; wait a second
    return None


def get_thumb_urls(
    image_paths=None,                 # files we want; "None" = all in images_dir
    images_dir:str="scraped_images",  # directory of full size images, no / on end
    size:tuple=(75,75),             # max dims of thumbnail; see PIL Image.thumbnail()
    verbose:bool=False                # whether to print status messages or not
    ) -> list:
    """
    This will save thumbnails of images and provide 'hosted' urls to them if on Colab

    Thumbnails are written to `images_dir + "_thumbs"` (inside '/gdrive/My Drive/'
    when on Colab), keeping each image's parent folder name and file name. That
    directory is deleted first. An image whose thumbnail cannot be saved under its
    own name is saved again with '.png' appended.

    Returns, when not on Colab, the list of thumbnail paths as strings; on Colab,
    the list of 'https://drive.google.com/uc?id=...' urls. Files for which no
    Drive id could be obtained are reported and left out, so the list can be
    shorter than the number of thumbnails.
    """

    thumbs_copy_dir = images_dir + "_thumbs"
    if IN_COLAB:
        print("Generating (URLS of) thumbnail images...")
        drive.mount('/gdrive')
        thumbs_copy_dir = '/gdrive/My Drive/'+ thumbs_copy_dir
    shutil.rmtree(thumbs_copy_dir, ignore_errors=True)      # clear out thumbs dir

    # get all the image filenames with full paths
    if image_paths is None:
        image_paths = [path for path in Path(images_dir).resolve().rglob('*') if path.suffix.lower() in ['.jpg', '.png']]

    # create the thumbnails and save them to Drive
    thumb_paths = []
    for fname in image_paths:
        fname = Path(fname) # just as a precaution
        tname = Path(thumbs_copy_dir) / fname.parent.name / fname.name
        tname.parent.mkdir(parents=True, exist_ok=True)  # create the parent directories before writing files
        with Image.open(fname) as im:
            im.thumbnail(size)
            if verbose: print(f"Attempting to save {tname}")
            try:
                im.save(tname)
            except OSError:  # sometimes getting jpg save errors, try those as png
                tname = Path(str(tname) +'.png')
                im.save(tname)
        thumb_paths.append(tname)

    if not IN_COLAB: return [str(t) for t in thumb_paths]  # For local runs, need to un-Path in order to serialize JSON

    print(f"Thumbnails saved to Google Drive in {thumbs_copy_dir}/\nWaiting on Google Drive until URLs are ready.\n")

    # get thumbnail URLs from Drive (might have to wait a bit for them)
    urls = []
    for tp in thumb_paths:
        fid = _drive_file_id(tp)
        if fid is None:
            print(f"Error, unable to generate URL for {tp}")
        else:
            urls.append(f'https://drive.google.com/uc?id={fid}')
            if verbose: print(f"url = {urls[-1]}")

    return urls

In [None]:
# Build thumbnails for everything under `target_dir` and show one of them.
urls = get_thumb_urls(images_dir=target_dir)
print("urls = ",urls)
# Off Colab get_thumb_urls returns local paths, so a previously hosted thumbnail is shown instead;
# the same fallback is used when no url came back, rather than failing on urls[0].
url = urls[0] if (IN_COLAB and urls) else 'https://drive.google.com/uc?id=1owIYMyW7yaYlZ4QcJ8P3iIxkl_mCN-GX'
HTML(f'<img src="{url}">')   # quoted: the url contains '=', which an unquoted attribute value must not

Generating (URLS of) thumbnail images...
Mounted at /gdrive
Thumbnails saved to Google Drive in /gdrive/My Drive/scraped_images_thumbs/
Waiting on Google Drive until URLs are ready.

urls =  ['https://drive.google.com/uc?id=1-vlWsaAgAWUDqGU6QYEnE6dNc7FXOTIc', 'https://drive.google.com/uc?id=1-lIY7X34i05A0gn4b6PqOK5Fk3nuPdLD', 'https://drive.google.com/uc?id=1-kJRrd4p1EBgCUlMKWA0stS8aqdPuY3z', 'https://drive.google.com/uc?id=1-fEvf_HOQYNsk9-7k5Mi50eqsnWcgQe3', 'https://drive.google.com/uc?id=1-dVbr5RQ4qfFo2GY7m0w_jBvyzCVsfyS', 'https://drive.google.com/uc?id=1-dMbOceudlgpvtbuQ7T2hVQv6sVF_uga', 'https://drive.google.com/uc?id=1-cQ8bLx0P68BlriGnU9S-akARiUueRqE', 'https://drive.google.com/uc?id=1-bDmFoUTrAO6U7wTjQMDMKaV78YbRA59', 'https://drive.google.com/uc?id=1-WNGQ9gY8zmVjA6lKdV2EhF9Dyzm6c2x', 'https://drive.google.com/uc?id=1-WMPKgQVd5xs6Tgp5HqV0RyskfsZj152', 'https://drive.google.com/uc?id=1-VeIW67o5XjN3nZRrBSf687lVij-m6is', 'https://drive.google.com/uc?id=1-UxdWKYaldoUei5LMFKt-HN4f7b

In [None]:
#export 
def exhibit_urls(targ, labels=['cat','dog','horse']):
    """grabs a set of urls, in order of images that match the labels corresponding to targets

    targ:   array of integer class indices (needs `.max()` and elementwise `==`,
            e.g. a numpy array); class t is searched for as labels[t]
    labels: search phrase for each class index (the default list is never modified)

    Returns a list with one url per entry of `targ`. Each class asks the search
    for as many urls as it has targets; if fewer come back they are reused
    cyclically so the result always lines up with `targ`.
    Raises IndexError if a class that occurs in `targ` yields no urls at all.
    """
    if len(targ) == 0: return []

    dim = int(targ.max())+1
    url_store = []
    for t in range(dim): # for each set of targets, scrape that many urls for the label
        label, n = labels[t], int(np.sum(targ == t)) # count how many of each target there are
        # skip the web request for classes that never occur; the search may also return None
        found = list(search_images_ddg(label, max_n=n) or []) if n > 0 else []
        if n > 0 and not found:
            raise IndexError(f"no image urls found for label '{label}'")
        url_store.append(found)

    served = [0]*dim   # how many urls of each class have been handed out so far
    urls = []
    for i in range(len(targ)): # supply a url matching each target
        t = int(targ[i])
        pool = url_store[t]
        urls.append(pool[served[t] % len(pool)])
        served[t] += 1
    return urls

In [None]:
# calc_prob is imported from mrspuff.utils; only `targ` is used below.
# presumably `targ` holds one class index per sample and n is the sample count — TODO confirm
prob, targ = calc_prob(n=400)
urls = exhibit_urls(targ, ['cat','dog','horse'])
assert len(targ) == len(urls)   # exactly one url per target