In [None]:
# default_exp scrape

# scrape

> This is a collection of routines for dataset-building via web scraping, intended to be run on Google Colab

This is modified from Google-scraping code first shared [by akashgshastri on Fast.ai forums](https://forums.fast.ai/t/google-image-scraper/79682/6) which I then updated.

*Note: It turns out that Jeremy Howard & Sylvain Gugger had already taught scraping in the [2020 version of the FastAI course](https://github.com/fastai/course2020), and provided some useful routines. So I'm going to remove the code I had written previously and replace it with slightly modified versions of theirs.  (In particular, their DuckDuckGo scraper doesn't require any Selenium ChromeDriver, unlike the Google-scraping code I'd written. Which is great, because that was messing up the CI on GitHub anyway.)*



In [None]:
#export
import re
import requests 
import json 
#from fastai.fastcore import *

#modified from fastbook utils, https://github.com/fastai/course20/blob/master/fastbook/__init__.py
#by Jeremy Howard and Sylvain Gugger.  Just removed the .decode() formatting, and replaced L() with list()
def search_images_ddg(key,max_n=200,max_retries=3):
    """Search DuckDuckGo images for 'key' and return up to 'max_n' unique image urls.
    (Adopted from https://github.com/deepanprabhu/duckduckgo-images-api)

    The query is first posted to the DuckDuckGo landing page to obtain the 'vqd'
    token, then the 'i.js' endpoint is paged through until enough urls are found
    or there is no 'next' page.

    key:         search phrase
    max_n:       maximum number of unique urls to return
    max_retries: how many times in a row a failing page request is retried before
                 giving up and returning whatever has been collected so far

    Returns a list of unique urls in the order they were first seen. The list is
    empty when the token cannot be parsed or nothing could be fetched.
    """
    url        = 'https://duckduckgo.com/'
    res        = requests.post(url,data={'q':key})
    searchObj  = re.search(r'vqd=([\d-]+)\&',res.text)
    if not searchObj:
        print('Token Parsing Failed !')
        return []       # empty list (not None) so callers can still use len() / iterate
    requestUrl = url + 'i.js'
    headers    = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params     = (('l','us-en'),('o','json'),('q',key),('vqd',searchObj.group(1)),('f',',,,'),('p','1'),('v7exp','a'))
    urls, seen = [], set()      # 'seen' de-duplicates while 'urls' keeps first-seen order
    failures   = 0              # consecutive failed attempts on the current page
    while len(urls) < max_n:
        try:
            res  = requests.get(requestUrl,headers=headers,params=params)
            data = json.loads(res.text)
            for obj in data['results']:
                image_url = obj['image']
                if image_url in seen: continue      # only unique urls count towards max_n
                seen.add(image_url)
                urls.append(image_url)
                if len(urls) >= max_n: return urls
            if 'next' not in data: return urls
            requestUrl = url + data['next']
            failures   = 0
        except Exception as e:
            # Best-effort scraping: retry the same page a bounded number of times
            # instead of spinning forever (KeyboardInterrupt is no longer swallowed).
            failures += 1
            if failures > max_retries:
                print(f"Giving up on '{key}' after {failures} failed requests - {e}")
                return urls
    return urls

In [None]:
#export 
import os, io  
from PIL import Image, ImageOps
import hashlib


def download_and_save(folder_path:str, url:str, verbose:bool=True, timeout:float=10.0) -> bool:
    """Download the image at 'url', re-encode it as an RGB JPEG and save it in 'folder_path'.

    The file name is the first 10 hex characters of the SHA-1 of the downloaded
    bytes plus '.jpg', so identical downloads map to the same file.

    folder_path: existing directory to write into
    url:         address of the image to fetch
    verbose:     print a SUCCESS line when the image is saved
    timeout:     seconds to wait for the server before giving up on the download

    Returns True if the image was saved, False otherwise. Errors are printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()     # report HTTP errors as download failures, not as unreadable images
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        return False    # nothing to decode; skip the save step entirely

    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        file_name = hashlib.sha1(image_content).hexdigest()[:10] + '.jpg'
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
        return False

    if verbose:  print(f"SUCCESS - saved {url} - as {file_path}")
    return True
    

def search_and_download(search_term:str, target_path:str='./images', number_images:int=10, verbose:bool=True):
    """Search for 'search_term' and download the resulting images into their own folder.

    Images go to '<target_path>/<search term, lower-cased, spaces replaced by underscores>',
    which is created if needed.

    search_term:   phrase to search for
    target_path:   parent directory holding one sub-folder per search term
    number_images: maximum number of urls requested from the search
    verbose:       print per-image and summary messages

    Returns (count, urls): the number of images saved and the urls they came from.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))
    os.makedirs(target_folder, exist_ok=True)

    # A failed search may yield None; treat that as "no results" instead of crashing on len().
    try_urls = search_images_ddg(search_term, max_n=number_images) or []
    print(f"...got {len(try_urls)} urls for term '{search_term}'")

    # keep only the urls whose images were successfully saved
    urls  = [url for url in try_urls if download_and_save(target_folder, url, verbose=verbose)]
    count = len(urls)

    if verbose: print(f"{search_term}: Expected {number_images}, succeeded at saving {count}.")
    return count, urls


# work in progress
class Category:
    """Per-search-term container for scraped data.

    Both attributes start as empty lists and are filled in by other code:
    'urls' with the source urls of saved images, 'images' with loaded image objects.
    """
    def __init__(self):
        self.images = []    # loaded image objects
        self.urls = []      # urls the images were downloaded from

    def __len__(self):
        """Number of loaded images; 'urls' is not required to have the same length."""
        return len(self.images)

In [None]:
#export
import shutil
import glob


def img_scrape(search_terms:list, target_path:str='./.images', number_images:int=10, verbose:bool=True):
    """Scrape images for every term in 'search_terms' into per-term folders under 'target_path'.

    WARNING: every existing sub-directory of 'target_path' is deleted first, so each
    run starts from a clean dataset. Plain files and symlinks are left alone.

    search_terms:  phrases to search for; each becomes one category
    target_path:   directory under which the category folders are created
    number_images: maximum number of urls requested per term
    verbose:       print progress messages (also forwarded to the download step)

    Returns a dict mapping each search term to a Category whose 'urls' holds the
    urls of the images that were saved.
    """
    # clear out directory before use
    for category_path in glob.glob(os.path.join(target_path, "*")):
        # rmtree raises on regular files and symlinks, so only remove real directories
        if os.path.isdir(category_path) and not os.path.islink(category_path):
            shutil.rmtree(category_path)

    dataset = {key: Category() for key in search_terms}

    for term in search_terms:
        if verbose: print(f"Searching on term '{term}'...")
        _, urls = search_and_download(search_term=term, target_path=target_path,
                                      number_images=number_images, verbose=verbose)
        dataset[term].urls = urls   # save urls in case we want them later

    return dataset

In [None]:
# ### Variables to adjust
#
# 1. set search_terms to a list of strings for which you want images
# 2. set number_images to the number of images you want for each class
# 3. set target_dir to the path where you want the image dataset created
search_terms = ["dog", "cat", "horse"]
#search_terms = ["les paul guitar", "stratocaster guitar"] # H/T Nathan Sepulveda
#search_terms = ["alligator", "crocodile"]                 # tricky
#search_terms = ["blue sky", "stop sign"]                  # easy: these separate by color!
#search_terms = ["smart person", "stupid person"]          # this is going to be a bad idea! (ethics)

number_images = 10
target_dir = 'scraped_images'            # where to save to

In [None]:
# Scrape images for every search term. Note: img_scrape clears existing category
# folders under target_dir before downloading, so earlier results there are lost.
dataset = img_scrape(search_terms, target_path=target_dir, number_images=number_images)

Searching on term 'dog'...
...got 10 urls for term 'dog'
SUCCESS - saved http://dogdayzgrooming.com/wp-content/gallery/home-gallery/Laughing-Golden-1140x760.jpg - as scraped_images/dog/6a374b11d6.jpg
SUCCESS - saved https://www.sciencealert.com/images/articles/processed/dog-doubt_1024.jpg - as scraped_images/dog/dae216cf27.jpg
SUCCESS - saved https://upload.wikimedia.org/wikipedia/commons/a/a5/Newfoundland_dog_Smoky.jpg - as scraped_images/dog/b80c464eef.jpg
SUCCESS - saved https://www.keystonepuppies.com/wp-content/uploads/2019/01/boxer-dog-grown.jpg - as scraped_images/dog/1524e33153.jpg
SUCCESS - saved https://www.stevedalepetworld.com/wp-content/uploads/2019/02/dog-in-shelter.jpg - as scraped_images/dog/006e6afb04.jpg
SUCCESS - saved http://www.rspcasa.org.au/wp-content/uploads/2018/11/Puppy_dogtraining.jpg - as scraped_images/dog/c33630fd6f.jpg
SUCCESS - saved https://www.pdsa.org.uk/media/4032/german-shepherd-dog-og.png - as scraped_images/dog/1c71edd98f.jpg
SUCCESS - saved http:

Let's see what's been saved to disk:

In [None]:
!ls {target_dir}/*

scraped_images/cat:
253ac3f03b.jpg	5db5815f59.jpg	7655f2e055.jpg	a95061b4ae.jpg	d4df5a839c.jpg
4e0250916f.jpg	617aa15b48.jpg	a6a645f3cd.jpg	d439145b0d.jpg

scraped_images/dog:
006e6afb04.jpg	1c71edd98f.jpg	6cfef71a27.jpg	b80c464eef.jpg	dae216cf27.jpg
1524e33153.jpg	6a374b11d6.jpg	88bb07cac8.jpg	c33630fd6f.jpg	de55a23f16.jpg

scraped_images/horse:
3f1f84d4b9.jpg	6851ee1b64.jpg	8f588bdb25.jpg	b821ac7db9.jpg
451d5d013d.jpg	8adbd9567c.jpg	96bce81f43.jpg	ece8ee9e61.jpg


Now we should probably inspect the data to see if it looks good or if we accidentally grabbed images we don't want. 

### Extra: Interactive Image Browser (Slider)

In [None]:
import numpy as np 

# Load images from disk
for term in search_terms:
    # Mirror the folder naming used when saving: lower-case, spaces to underscores.
    category_dir = os.path.join(target_dir, '_'.join(term.lower().split(' ')))
    # Collect files extension by extension; sorting makes the slider order repeatable.
    image_files = [path
                   for ext in ["jpg","gif","png","tga"]
                   for path in sorted(glob.glob(os.path.join(category_dir, f'*.{ext}')))]
    dataset[term].images = [Image.open(path) for path in image_files]
    print(f'Loaded {len(dataset[term].images)} images for {term}')

Loaded 10 images for dog
Loaded 9 images for cat
Loaded 8 images for horse


In [None]:
%matplotlib inline

In [None]:
#export
import matplotlib.pyplot as plt
from ipywidgets import interact
def browse_images(dataset):
    """Show an interactive image browser for 'dataset' inside the notebook.

    dataset: mapping of class name -> object that supports len() and has an
             'images' sequence (e.g. the dict returned by img_scrape once the
             images have been loaded).

    A drop-down picks the class and a slider picks the image within that class.
    """
    if not dataset:
        print("Nothing to browse: the dataset is empty.")
        return
    print("Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.")
    # Take the choices from the dataset itself rather than from a notebook-level global.
    @interact(term=list(dataset.keys()))
    def _browse_images(term):
        n = len(dataset[term])
        if n == 0:
            # a slider over (0, -1) would be invalid
            print(f"No images loaded for '{term}'.")
            return
        def view_image(i):
            plt.imshow(dataset[term].images[i], cmap=plt.cm.gray_r, interpolation='nearest')
            plt.show()
        interact(view_image, i=(0,n-1))

In [None]:
# Open the interactive browser: a drop-down selects the class, a slider selects the image.
browse_images(dataset)

Select the class from the drop-down, and the image by moving the slider with the mouse or the arrow keys.


interactive(children=(Dropdown(description='term', options=('dog', 'cat', 'horse'), value='dog'), Output()), _…

Let's make a bunch of thumbnails. So, for every image file under `target_dir`, load the image, shrink it, and save it under the same filename in a similar directory structure.

In [None]:
# hide
#!pip install xattr

In [None]:
#export 
import shlex
import shutil
import subprocess
import time
from pathlib import Path

from IPython.display import HTML

# Detect whether we are running on Google Colab: the google.colab package is
# only importable there. 'drive' is defined only when the import succeeds.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False


def get_thumb_urls(
    images_dir:str="scraped_images",  # directory of full size images, no / on end
    verbose:bool=False,               # whether to print status messages or not
    thumb_size:tuple=(150,150),       # maximum (width, height) of each thumbnail
    max_wait:int=100                  # max number of 1-second polls per file while waiting for a Drive file ID
    ) -> list:
    """Create thumbnails of the images in 'images_dir' on Google Drive and return their urls.

    Only works on Colab: mounts Drive at /gdrive, recreates
    '/gdrive/My Drive/<images_dir>_thumbs' from scratch, writes one thumbnail per
    .jpg/.png file (keeping the parent folder name and file name), then reads each
    thumbnail's 'user.drive.id' extended attribute to build a
    'https://drive.google.com/uc?id=...' url.

    Returns a list of urls, one per thumbnail, or None when not running on Colab.
    """
    if not IN_COLAB:
        print("Sorry, this only works on Colab")
        return None

    drive.mount('/gdrive')
    thumbs_copy_dir = '/gdrive/My Drive/'+ images_dir + "_thumbs"
    shutil.rmtree(thumbs_copy_dir, ignore_errors=True)      # clear out thumbs dir

    # get all the image filenames with full paths
    image_paths = [path for path in Path(images_dir).resolve().rglob('*') if path.suffix.lower() in ['.jpg', '.png']]

    # create the thumbnails and save them to Drive
    thumb_paths = []
    for f in image_paths:
        t = Path(thumbs_copy_dir) / f.parent.name / f.name
        thumb_paths.append(t)
        t.parent.mkdir(parents=True, exist_ok=True)  # create the parent directories before writing files
        with Image.open(f) as im:
            im.thumbnail(thumb_size)    # shrinks in place, keeping the aspect ratio
            im.save(t)
    print(f"Thumbnails saved to Google Drive in {thumbs_copy_dir}. Waiting til URLs are ready...")

    # get the urls to the thumbnails from drive
    urls = []
    for tp in thumb_paths:
        # Poll until the ID no longer contains 'local-'; presumably a 'local-' ID is a
        # temporary placeholder until Drive has generated the real FileID.
        fid, count = "local-", 0
        while ('local-' in fid) and (count < max_wait):
            # shlex.quote keeps paths containing quotes or spaces from breaking the shell command
            fid = subprocess.getoutput(f"xattr -p 'user.drive.id' {shlex.quote(str(tp))}")
            count += 1
            if 'local-' in fid: time.sleep(1)
        if 'local-' in fid:
            print(f"WARNING - no Drive file ID for {tp} after {count} tries; its url will not work")
        url = f'https://drive.google.com/uc?id={fid}'
        urls.append(url)
        if verbose: print(f"url = {url}")
    return urls

In [None]:
# Image shown when get_thumb_urls returns nothing (e.g. when not running on Colab).
fallback_url = 'https://drive.google.com/uc?id=1owIYMyW7yaYlZ4QcJ8P3iIxkl_mCN-GX'

urls = get_thumb_urls(target_dir)
print("urls = ",urls)
# 'urls' is None outside Colab and may be empty if no images were found.
url = urls[0] if urls else fallback_url
HTML(f'<img src="{url}">')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
Thumbnails saved to Google Drive in /gdrive/My Drive/scraped_images_thumbs. Waiting til URLs are ready...
urls =  ['https://drive.google.com/uc?id=1teWuIzJlQFbCL5x3_AHeZAB6w9vF5JCi', 'https://drive.google.com/uc?id=1tVtkkB5H4ZOlRtXvHicO528ug8QC43a5', 'https://drive.google.com/uc?id=1tUn4rbFWwQwVTc9G5ewHKJ5fqAazTP8g', 'https://drive.google.com/uc?id=1tSH4-vEbTzxdEcUom6PFuyzZcZEE1LAj', 'https://drive.google.com/uc?id=1tLU3aCozF9G4yKw0zqFK6Wd-yMmloapJ', 'https://drive.google.com/uc?id=1tG2HAjk0weIaKz2LgjgWTIGI67bT4bjK', 'https://drive.google.com/uc?id=1t9idauxJWC7uIbYK9z9twmNb1cOg1c8K', 'https://drive.google.com/uc?id=1t5jjtOD2sNA9zqTCsZ-seHm3RsF0LiWK', 'https://drive.google.com/uc?id=1t0UswWvvpUCG4GsR-i008D4YUgQpMhUG', 'https://drive.google.com/uc?id=1syM4M_-LSSvrTkzYcnIGSdduXhmt3nhl', 'https://drive.google.com/uc?id=1swqvr95eJxE7bqHeVAEzIMAvQIY-5Opl', 'https://drive.google.