In [None]:
# default_exp scrape

In [None]:
#hide 
# Do this on Colab and then restart runtime:
#!pip install fastai --upgrade | grep -v 'already satisfied'
#!pip install mrspuff --upgrade | grep -v 'already satisfied'

# scrape

> This is a collection of routines for dataset-building via web scraping, intended to be run on Google Colab

This is modified from Google-scraping code first shared [by akashgshastri on Fast.ai forums](https://forums.fast.ai/t/google-image-scraper/79682/6) which I then updated.

*Note: It turns out that Jeremy Howard & Sylvain Gugger had already taught scraping in the [2020 version of the FastAI course](https://github.com/fastai/course2020), and provided some useful routines. So I'm going to remove the code I had written previously and replace it with slightly modified versions of theirs. (In particular, their DuckDuckGo scraper doesn't require any Selenium ChromeDriver like the Google-scraping code I'd written. Which is great, because that was messing up the CI on GitHub anyway.)*



In [None]:
#export
from fastcore.basics import *
from fastai.vision.all import *
import numpy as np 
import re
import requests 
import json 
import os, io  
from PIL import Image, ImageOps
import hashlib
import shutil
import glob
from pathlib import Path
import subprocess
import time 
from IPython.display import HTML
import matplotlib.pyplot as plt
from ipywidgets import interact
from mrspuff.utils import calc_prob, on_colab 
import pandas as pd

In [None]:
#export

#modified from fastbook utils, https://github.com/fastai/course20/blob/master/fastbook/__init__.py
#by Jeremy Howard and Sylvain Gugger.  Just removed the .decode() formatting, and replaced L() with list()
def search_images_ddg(key,max_n=200):
    """By Howard & Gugger: Search for 'key' with DuckDuckGo and return up to 'max_n' unique image urls
    (Adopted from https://github.com/deepanprabhu/duckduckgo-images-api)

    key:   the search phrase sent to DuckDuckGo
    max_n: the maximum number of unique urls to return

    Returns a list of url strings, in the order the results were received.
    The list is empty if 'max_n' < 1 or the search token can't be parsed, and it
    can be shorter than 'max_n' if the results run out or the result requests
    keep failing.
    """
    if max_n < 1: return []
    url          = 'https://duckduckgo.com/'
    timeout      = 30      # seconds to wait on any single HTTP request
    max_failures = 10      # consecutive failed result requests tolerated before giving up
    params       = {'q':key}
    res          = requests.post(url,data=params,timeout=timeout)
    searchObj    = re.search(r'vqd=([\d-]+)\&',res.text)
    if not searchObj: print('Token Parsing Failed !'); return []
    requestUrl   = url + 'i.js'
    headers      = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params       = (('l','us-en'),('o','json'),('q',key),('vqd',searchObj.group(1)),('f',',,,'),('p','1'),('v7exp','a'))
    urls, seen   = [], set()
    failures, last_error = 0, None
    while failures < max_failures:
        try:
            res     = requests.get(requestUrl,headers=headers,params=params,timeout=timeout)
            data    = json.loads(res.text)
            results = data['results']
        except Exception as e:
            # Retry the same page, but only a bounded number of times: an unbounded
            # retry would spin forever on a persistent error. Catching Exception
            # (not a bare except) keeps Ctrl-C / kernel interrupt working.
            failures, last_error = failures + 1, e
            time.sleep(0.25)
            continue
        failures = 0
        for obj in results:
            image_url = obj.get('image')
            if not image_url or image_url in seen: continue   # dedupe as we go so max_n counts unique urls
            seen.add(image_url)
            urls.append(image_url)
            if len(urls) >= max_n: return urls
        if 'next' not in data: return urls
        requestUrl = url + data['next']
    print(f"Giving up on '{key}' after {max_failures} failed requests in a row ({last_error}); returning {len(urls)} urls")
    return urls

In [None]:
#export 
def download_and_save(folder_path:str, url:str, verbose:bool=True):
    """Download the image at 'url', re-encode it as an RGB JPEG and save it inside 'folder_path'.

    The file name is the first 10 hex digits of the SHA-1 of the downloaded bytes plus '.jpg'.

    folder_path: existing directory to write into
    url:         address of the image to fetch
    verbose:     currently unused (the per-file success message is disabled); kept so existing calls still work

    Returns the path of the saved file, or '' if either the download or the save failed.
    Failures are reported with a printed message, never raised.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()     # an HTTP error page is not an image; report it as a download failure
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        return ''                       # nothing to decode, so stop here instead of failing a second time below

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        # Encode in memory first so a failed conversion never leaves a truncated file on disk
        encoded = io.BytesIO()
        image.save(encoded, "JPEG", quality=85)
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            f.write(encoded.getvalue())
        #if verbose:  print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return ''
    return file_path
    

def search_and_download(search_term:str, df=None, target_path:str='./images', num_images:int=10, verbose:bool=True):
    """Search DuckDuckGo for 'search_term' and save the images found into a per-term folder.

    search_term: phrase to search for; also stored as the 'label' of every saved image
    df:          optional DataFrame of earlier results to which the new rows are added
    target_path: parent directory; images go in '<target_path>/<search_term lower-cased, spaces as underscores>'
    num_images:  how many image urls to ask the search for
    verbose:     print a per-term summary; also passed on to download_and_save

    Returns a DataFrame with columns 'file_path', 'label', 'orig_url' holding one row per
    successfully saved image (preceded by the rows of 'df' when that was given).
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))
    os.makedirs(target_folder, exist_ok=True)

    try_urls = search_images_ddg(search_term, max_n=num_images) or []   # a failed search may hand back None
    print(f"...got {len(try_urls)} urls for term '{search_term}'")

    # Collect plain records and build the frame once; DataFrame.append no longer exists in pandas 2
    records = []
    for url in try_urls:
        file_path = download_and_save(target_folder, url, verbose=verbose)
        if file_path != '':
            records.append({ 'file_path' : file_path, 'label':search_term, 'orig_url' : url})

    # Report this term's own successes, not the size of the accumulated frame
    if verbose: print(f"{search_term}: Expected {num_images}, succeeded at saving {len(records)}.")

    new_rows = pd.DataFrame(records, columns = ['file_path', 'label', 'orig_url'])
    if df is None: return new_rows
    return pd.concat([df, new_rows], ignore_index = True)   # successfully downloaded files and the urls they came from


In [None]:
#export
def img_scrape(search_terms:list, target_path:str='./.images', num_images:int=10, verbose:bool=True):
    """Scrape images for every term in 'search_terms' into per-term folders under 'target_path'.

    WARNING: everything matched by '<target_path>/*' is deleted before scraping starts.

    search_terms: list of search phrases, one image class per phrase
    target_path:  parent directory for the per-term folders
    num_images:   number of image urls requested per term
    verbose:      print progress; also passed on to search_and_download

    Returns one DataFrame with columns 'file_path', 'label', 'orig_url' covering all terms.
    """
    # clear out directory before use
    for entry in glob.glob(os.path.join(target_path, "*")):
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry)
        else:
            os.remove(entry)    # rmtree raises on plain files and symlinks

    frames = []
    for term in search_terms:
        if verbose: print(f"Searching on term '{term}'...")
        frames.append(search_and_download(search_term = term, target_path=target_path, num_images=num_images, verbose=verbose))

    if not frames: return pd.DataFrame(columns = ['file_path', 'label', 'orig_url'])
    return pd.concat(frames, ignore_index = True) # file paths and original urls for all terms

In [None]:
'''
### Variables to adjust

1. set search_terms to a list of strings for which you want images
2. set num_images to the number of images you want for each class
3. set target_dir to the path where you want the image dataset created.
'''
search_terms = ['cat','dog','horse']
#search_terms = ["les paul guitar", "stratocaster guitar"] # H/T Nathan Sepulveda
#search_terms = ["alligator", "crocodile"]                 # tricky
#search_terms = ["blue sky", "stop sign"]                  # easy: these separate by color!
#search_terms = ["smart person", "stupid person"]          # this is going to be a bad idea! (ethics)

num_images = 10                          # images requested per search term
target_dir = 'scraped_images'            # where to save to

In [None]:
# Scrape `num_images` images per search term into `target_dir`.
# NOTE: img_scrape first deletes whatever is already inside `target_dir`.
df = img_scrape(search_terms, target_path=target_dir, num_images=num_images, verbose=True)
print(len(df),"images")
# Last expression of the cell, so the notebook renders the DataFrame (one row per saved image)
df

Searching on term 'cat'...
...got 10 urls for term 'cat'
ERROR - Could not save https://www.siberiancatsinformation.com/wp-content/uploads/2020/06/45502005281_db333d028c_o-scaled.jpg - cannot identify image file <_io.BytesIO object at 0x7f7af0f20e90>
cat: Expected 10, succeeded at saving 9.
Searching on term 'dog'...
...got 10 urls for term 'dog'
ERROR - Could not save https://mooshme.com/wp-content/uploads/2017/02/MooshMe-Collie.jpg - cannot identify image file <_io.BytesIO object at 0x7f7af0eb4a70>
dog: Expected 10, succeeded at saving 9.
Searching on term 'horse'...
...got 10 urls for term 'horse'
ERROR - Could not save http://www.hdwallpaperslife.com/wp-content/uploads/2019/02/5c6a9f8635e6e.jpg - cannot identify image file <_io.BytesIO object at 0x7f7af0eb4cb0>
ERROR - Could not save https://horsebreedslist.com/wp-content/uploads/2021/01/arab-horse.jpg - cannot identify image file <_io.BytesIO object at 0x7f7af0eb4cb0>
ERROR - Could not save https://horsespirit.site/wp-content/uplo

Unnamed: 0,file_path,label,orig_url
0,scraped_images/cat/7f429fb7c2.jpg,cat,"https://www.thesprucepets.com/thmb/5tDHikYVxrJXwQCgVhacYo5PS4g=/2121x1414/filters:fill(auto,1)/Madelein_WolfGettyImages-845007724-cff4995e51a749e0a69f529c75447f5b.jpg"
1,scraped_images/cat/384ed12bf5.jpg,cat,https://d.newsweek.com/en/full/1504969/black-cat-kitten-pet-animal-stock-getty.jpg
2,scraped_images/cat/6ea6b56a70.jpg,cat,https://ukmadcat.com/wp-content/uploads/2019/04/sleepy-cat.jpg
3,scraped_images/cat/9f56fe3cc7.jpg,cat,https://pbs.twimg.com/media/DUfDNvTVQAA2osS.jpg:large
4,scraped_images/cat/dbf944bb39.jpg,cat,http://www.trbimg.com/img-5a68a878/turbine/ct-grumpy-cat-lawsuit-20180124
5,scraped_images/cat/e221f55e4e.jpg,cat,https://wallup.net/wp-content/uploads/2015/06/Goldish-cat-smiles.jpg
6,scraped_images/cat/20c65dbd42.jpg,cat,https://pixnio.com/free-images/2017/09/26/2017-09-26-07-19-47.jpg
7,scraped_images/cat/df448c01e0.jpg,cat,http://pulpbits.net/wp-content/uploads/2014/01/tabby-cat.jpg
8,scraped_images/cat/10448ff709.jpg,cat,https://petsnurturing.com/wp-content/uploads/2018/08/Spotted-cat-breeds-4.jpg
9,scraped_images/dog/24dd31c9c4.jpg,dog,https://wallup.net/wp-content/uploads/2014/10/animal/Cute_Golden_Dog.jpg


Let's see what's been saved to disk:

In [None]:
!ls {target_dir}/*

scraped_images/cat:
10448ff709.jpg	384ed12bf5.jpg	7f429fb7c2.jpg	dbf944bb39.jpg	e221f55e4e.jpg
20c65dbd42.jpg	6ea6b56a70.jpg	9f56fe3cc7.jpg	df448c01e0.jpg

scraped_images/dog:
1a890e08f2.jpg	2d03121fbf.jpg	728c8698ca.jpg	ca2fa0579f.jpg	e9aac69517.jpg
24dd31c9c4.jpg	5b4e30ce21.jpg	766000910b.jpg	ccf73ae866.jpg

scraped_images/horse:
138ab98b77.jpg	3fa874e935.jpg	54d20582ec.jpg	e53be8dea6.jpg
39e6833389.jpg	4f694a7ca8.jpg	9f3ea2cc68.jpg


Now we should probably inspect the data to see if it looks good or if we accidentally grabbed images we don't want. 

### Extra: Interactive Image Browser (Slider)

In [None]:
# work in progress
class Category():
    """Container for one class of the dataset: a list of images and a list of urls."""
    def __init__(self):
        self.urls = []      # presumably the source url of each image; nothing in this notebook fills it yet
        self.images = []    # the loaded images

    def __len__(self):
        """The size of a category is its number of images."""
        image_count = len(self.images)
        url_count = len(self.urls)   # measured but not compared: the check that both counts agree is still disabled
        return image_count

# Load images from disk
dataset = {key: Category() for key in search_terms}
for term in search_terms:
    # Same term -> folder-name rule that search_and_download() uses when saving
    # (lower-case, spaces to underscores). The name `term_dir` avoids rebinding the builtin `dir`.
    term_dir = Path(target_dir) / '_'.join(term.lower().split(' '))
    image_files = sorted(item for ext in ["jpg","gif","png","tga"] for item in term_dir.glob(f'*.{ext}'))
    loaded = []
    for image_file in image_files:
        # Image.open is lazy and keeps the file open; copy the pixels and close the handle right away
        with Image.open(image_file) as im:
            loaded.append(im.copy())
    dataset[term].images = loaded
    print(f'Loaded {len(dataset[term].images)} images for {term}')

Loaded 9 images for cat
Loaded 9 images for dog
Loaded 7 images for horse


In [None]:
%matplotlib inline

In [None]:
#export
def browse_images(dataset):
    """Show an interactive image browser: a drop-down of classes plus a slider over that class's images.

    dataset: dict mapping each class name to an object that supports len() and has an
             `.images` list of things plt.imshow can draw
    """
    print("Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.")
    # Take the choices from the dataset itself rather than a notebook-level variable,
    # so the function also works when imported from the exported module
    @interact(term=list(dataset.keys()))
    def _browse_images(term):
        n = len(dataset[term])
        if n == 0:      # a slider over (0,-1) is invalid, so say so instead
            print(f"No images loaded for '{term}'.")
            return
        def view_image(i):
            plt.imshow(dataset[term].images[i], cmap=plt.cm.gray_r, interpolation='nearest')
            plt.show()
        interact(view_image, i=(0,n-1))

In [None]:
# Open the interactive browser on the images loaded into `dataset` above
browse_images(dataset)

Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.


interactive(children=(Dropdown(description='term', options=('cat', 'dog', 'horse'), value='cat'), Output()), _…

Let's make a bunch of thumbnails: for every image file under the images directory (`target_dir`), load the image, shrink it, and save it under the same class-folder and file name in a parallel `<images_dir>_thumbs` directory.

In [None]:
#export 

# Detect Google Colab by whether its `drive` helper can be imported
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:     # only a missing module means "not on Colab"; don't swallow Ctrl-C or real errors
    IN_COLAB = False

def _drive_file_id(path, timeout:int=60):
    """Poll the 'user.drive.id' extended attribute of `path`, once a second for up to `timeout` tries.

    Returns the ID string, or None if it is still missing or still a 'local-' placeholder at the end.
    """
    for attempt in range(timeout):
        try:
            # Argument list instead of a shell string, so quotes or spaces in the path can't break the command
            proc = subprocess.run(['xattr', '-p', 'user.drive.id', str(path)], capture_output=True, text=True)
        except OSError as e:
            print(f"Error, could not run xattr: {e}")
            return None
        fid = proc.stdout.strip()
        if proc.returncode == 0 and fid and 'local-' not in fid:
            return fid
        if attempt < timeout - 1: time.sleep(1)      # url still isn't ready; wait a second
    return None


def get_thumb_urls(
    image_paths=None,                 # files we want; "None" = all in images_dir
    images_dir:str="scraped_images",  # directory of full size images, no / on end
    size:tuple=(100,100),             # max dims of thumbnail; see PIL Image.thumbnail()
    verbose:bool=False                # whether to print status messages or not
    ) -> list:
    """
    This will save thumbnails of images and provide 'hosted' urls to them if on Colab

    Thumbnails are written to '<images_dir>_thumbs/<parent folder name>/<file name>'; that
    directory is deleted and rebuilt on every call. On Colab it lives under '/gdrive/My Drive/'
    after Google Drive has been mounted.

    Returns a list of strings: Google Drive urls when on Colab, local thumbnail paths otherwise.
    On Colab, a thumbnail whose Drive ID never becomes available is reported and left out,
    so the list can be shorter than the number of images.
    """

    thumbs_copy_dir = images_dir + "_thumbs"
    if IN_COLAB:
        print("Generating (URLS of) thumbnail images...")
        drive.mount('/gdrive')
        thumbs_copy_dir = '/gdrive/My Drive/'+ thumbs_copy_dir
    shutil.rmtree(thumbs_copy_dir, ignore_errors=True)      # clear out thumbs dir

    # get all the image filenames with full paths (sorted, so the output order is reproducible)
    if image_paths is None:
        image_paths = sorted(path for path in Path(images_dir).resolve().rglob('*') if path.suffix.lower() in ['.jpg', '.jpeg', '.png'])

    # create the thumbnails and save them to Drive
    thumb_paths = []
    for fname in image_paths:
        fname = Path(fname) # just as a precaution
        tname = Path(thumbs_copy_dir) / fname.parent.name / fname.name
        tname.parent.mkdir(parents=True, exist_ok=True)  # create the parent directories before writing files
        with Image.open(fname) as im:
            im.thumbnail(size)
            if verbose: print(f"Attempting to save {tname}")
            try:
                im.save(tname)
            except OSError:  # sometimes getting jpg save errors, try those as png
                tname = Path(str(tname) +'.png')
                im.save(tname)
        thumb_paths.append(tname)

    if not IN_COLAB: return [str(t) for t in thumb_paths]  # For local runs, need to un-Path in order to serialize JSON

    print(f"Thumbnails saved to Google Drive in {thumbs_copy_dir}/\nWaiting on Google Drive until URLs are ready.\n")

    # get thumbnail URLs from Drive (might have to wait a bit for them)
    urls = []
    for tp in thumb_paths:
        fid = _drive_file_id(tp)
        if fid is None:
            print(f"Error, unable to generate URL for {tp}")
        else:
            urls.append(f'https://drive.google.com/uc?id={fid}')
            if verbose: print(f"url = {urls[-1]}")

    return urls

In [None]:
# Build thumbnails for every image under `target_dir`.
# get_thumb_urls returns Drive urls on Colab and local thumbnail paths everywhere else.
urls = get_thumb_urls(images_dir=target_dir)
print("urls = ",urls)
# Off Colab there is no hosted url to show, so fall back to a hard-coded Drive url
url = urls[0] if IN_COLAB else 'https://drive.google.com/uc?id=1owIYMyW7yaYlZ4QcJ8P3iIxkl_mCN-GX'
# Last expression of the cell: render the chosen image inline
HTML(f"<img src={url}>")

Generating (URLS of) thumbnail images...
Mounted at /gdrive
Thumbnails saved to Google Drive in /gdrive/My Drive/scraped_images_thumbs/
Waiting on Google Drive until URLs are ready.

urls =  ['https://drive.google.com/uc?id=102OW28_qsr7vr0dfHeUUQM0yeox2MIL7', 'https://drive.google.com/uc?id=100WfSNpv6qg1NL7yRlHyac8TfMzANURl', 'https://drive.google.com/uc?id=1-zMDtc2ewp-Gx_VX7GZP_ukLG6fVyerR', 'https://drive.google.com/uc?id=1-ubtGLq03kIwFX51BoNY2w_Wr0gtZAOY', 'https://drive.google.com/uc?id=1-jdjUNmo7ICJc82YJrmZ1jnJ_u2lJY5Y', 'https://drive.google.com/uc?id=1-hObfXam8rLUpAVc-XvYaDXxdTzYBHlz', 'https://drive.google.com/uc?id=1-hOQukQaQqh_g1l80NIvPHaV0VYMPkyR', 'https://drive.google.com/uc?id=1-gbieUEfx3n_-ExMDtyKA3JaTkoxSUq4', 'https://drive.google.com/uc?id=1-gUUCLLKCGszZonChgYarKTLTZvpL4Qp', 'https://drive.google.com/uc?id=1-e0uprmgXCcr9nMMOmMNftnE3lSHGZPT', 'https://drive.google.com/uc?id=1-cm-llHr2xXGt7DUAWpFHzUwHqJB_WBq', 'https://drive.google.com/uc?id=1-c4UeyEk03VGLEtXNmwLJGOekqW

In [None]:
#export 
def exhibit_urls(targ, labels=['cat','dog','horse']):
    """grabs a set of urls, in order of images that match the labels corresponding to targets

    targ:   1-D array of integer class indices; index t refers to labels[t]
    labels: search phrase for each class index

    Returns a list with one image url per entry of 'targ', in the same order. If a search
    yields fewer urls than a class needs, that class's urls are reused in rotation.

    Raises IndexError if 'targ' contains an index with no entry in 'labels', and
    RuntimeError if a needed label's search returns no urls at all.
    """
    targ = np.asarray(targ)
    if targ.size == 0: return []

    dim = int(targ.max())+1
    if dim > len(labels):   # fail before doing any web searches
        raise IndexError(f"targ refers to class {dim-1} but only {len(labels)} labels were given")

    url_store = []
    for t in range(dim): # for each class, scrape as many urls as there are targets of that class
        n = int(np.sum(targ == t))
        if n == 0:          # class never occurs; skip the web request
            url_store.append([])
            continue
        found = search_images_ddg(labels[t], max_n=n) or []
        if not found: raise RuntimeError(f"No image urls found for label '{labels[t]}'")
        url_store.append(found)

    used = [0]*dim          # how many urls of each class have been handed out so far
    urls = []
    for t in targ:          # supply a url matching each target
        t = int(t)
        pool = url_store[t]
        urls.append(pool[used[t] % len(pool)])
        used[t] += 1
    return urls

In [None]:
# calc_prob comes from mrspuff.utils; presumably `targ` holds one integer class index per sample (TODO confirm)
prob, targ = calc_prob(n=400)
urls = exhibit_urls(targ, ['cat','dog','horse'])
# exhibit_urls must supply exactly one url per target
assert len(targ) == len(urls)

In [None]:
# export 

# actually here's a newer interface that I prefer
def scrape_for_me(dl_path, labels, search_suffix, erase_dir=True, max_n=100):
    """Download images for each label into '<dl_path>/<label>/' and clean up the result.

    dl_path:       directory to download into; created (including parents) if missing
    labels:        iterable of class names; each one becomes a sub-folder name
    search_suffix: text appended to every label to form the search phrase, e.g. 'bear'
    erase_dir:     if True, delete 'dl_path' and everything in it before starting
    max_n:         maximum number of image urls requested per label

    After downloading, files that verify_images reports as failed are deleted, and every
    remaining '.png' file is re-saved as RGBA under '<original name>2.png' with the
    original removed (this avoids transparency warnings later on).

    Returns a pathlib.Path pointing to 'dl_path'.
    """
    if erase_dir:
        shutil.rmtree(dl_path, ignore_errors=True)
    path = Path(dl_path)
    path.mkdir(parents=True, exist_ok=True)     # parents=True so nested download paths work too
    for o in labels:            # scrape images off the web
        search_term = (f'{o} {search_suffix}').strip()
        dest = (path/o)
        dest.mkdir(exist_ok=True)
        urls = search_images_ddg(f'{search_term}', max_n=max_n) or []   # a failed search may hand back None
        urls = [ x for x in urls if ("magpies" not in x) and ("charliebears" not in x) ]   # kludge for now to keep download_images from hanging
        print(f"{search_term}: Got {len(urls)} image URLs. Downloading...")
        print("   urls = ",urls)
        download_images(dest, urls=urls, preserve_filename=False)
        print("    Images downloaded.")

    fns = get_image_files(path)         # list image filenames
    failed = set(verify_images(fns))    # check if any are unloadable

    # remove what's unloadable
    for f in failed:
        Path(f).unlink()
    fns = [f for f in fns if f not in failed]

    # Extra: To avoid Transparency warnings, convert PNG images to RGBA, from https://forums.fast.ai/t/errors-when-training-the-bear-image-classification-model/83422/9
    for image in fns:
        if Path(image).suffix.lower() != '.png': continue   # test the extension, not any '.png' inside the path
        with Image.open(image) as im:                       # close the file before deleting it
            im.convert("RGBA").save(f"{image}2.png")
        Path(image).unlink()                                # delete the original once its copy is saved
    print(f"After checking images for issues, {len(get_image_files(path))} (total) images remain.")
    return path     # and return a pathlib object pointing to image dir

In [None]:
# NOTE: this is the same directory name as `target_dir` above, and scrape_for_me deletes it
# first (erase_dir defaults to True), so the cat/dog/horse images scraped earlier are removed.
dl_path = 'scraped_images'  # where we're saving to
labels = 'grizzly','black','teddy'  
# Each label is combined with the suffix to form the search phrase, e.g. 'grizzly bear'
search_suffix = 'bear'
path = scrape_for_me(dl_path, labels, search_suffix, max_n=30)

grizzly bear: Got 30 image URLs. Downloading...
   urls =  ['http://animalia-life.com/data_images/grizzly-bear/grizzly-bear5.jpg', 'https://ewscripps.brightspotcdn.com/dims4/default/62a4e0f/2147483647/strip/true/crop/1280x720+0+0/resize/1280x720!/quality/90/?url=http:%2F%2Fewscripps-brightspot.s3.amazonaws.com%2F36%2Fec%2F6e01f9b94305b1d3578f2458629b%2Fgrizzly-bear.png', 'https://wallsdesk.com/wp-content/uploads/2017/01/Grizzly-Bear-Wallpapers-HD.jpg', 'https://reidparkzoo.org/wp-content/uploads/2013/09/grzbear2812a.jpg', 'https://www.animalfactsencyclopedia.com/images/grizzlybearstanding.jpg.pagespeed.ce.7jsWfY9XXY.jpg', 'http://mediad.publicbroadcasting.net/p/kufm/files/styles/x_large/public/201606/grizzly_PD.jpg', 'https://wallsdesk.com/wp-content/uploads/2017/01/Grizzly-Bear-Desktop-Wallpaper-.jpg', 'https://www.conservationnw.org/wp-content/uploads/2017/10/grizzly-bears-1280x950.jpg', 'https://www.montanaoutdoor.com/wp-content/uploads/2018/02/Grizzly-King-FEAT.jpg', 'https://1.bp.