In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup
import urllib
from pathlib import Path
import os
import sys
import json
from tqdm import tqdm

# constants
# root of the local tree-finder checkout; scraped files are written beneath it
repo_path = Path('/Users', 'etriesch', 'dev', 'tree-finder')

In [2]:
# set up a profile to access website:
# build a urllib opener that sends a browser-like User-agent header and
# install it as the default opener for subsequent urllib.request calls
browser_headers = [('User-agent', 'Mozilla/5.0')]
opener = urllib.request.build_opener()
opener.addheaders = browser_headers
urllib.request.install_opener(opener)

# Arbor Day Foundation website
https://shop.arborday.org/

In [3]:
def get_arbor_tree_images(url, tree, timeout=30):
    """Collect the image links on a page that mention a given tree.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    tree : str
        Hyphen-separated tree name (e.g. ``'black-walnut'``). A link is kept
        when it contains at least one of the hyphen-separated words; the
        comparison is case-insensitive.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up with a
        timeout error. Defaults to 30.

    Returns
    -------
    list of str
        ``src`` values of the matching ``<img>`` tags, in page order.
    """
    # load page
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # get all image links; <img> tags without a src attribute are skipped
    # instead of raising KeyError
    links = [image_tag.get('src') for image_tag in soup.find_all('img')]
    links = [link for link in links if link]

    # subset to tree image links. Words are lower-cased to match the
    # lower-cased link, and empty words are dropped because '' is a substring
    # of every link.
    words = [word for word in tree.lower().split('-') if word]
    return [link for link in links if any(word in link.lower() for word in words)]

In [4]:
# constants
source = 'arborday'  # prefix used in the downloaded file names
trees = [
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'chinese chestnut',
]
# one flag per entry of `trees`, in the same order
# (presumably whether the site has a page for that tree -- TODO confirm)
tree_in_db = [
    False,
    True,
    False,
    True,
    True,
    True,
    True,
]

In [5]:
# for every tree: scrape its shop page, then download each matching image
for t in trees:
    # the page URL uses a hyphenated name; the output file name uses no separator
    tree = t.replace(' ', '-')
    url = f'https://shop.arborday.org/{tree}'
    file_stem = f'{source}-{t.replace(" ", "")}'

    # get links to tree images
    tree_links = get_arbor_tree_images(url, tree)
    print(f'pulling {len(tree_links)} {tree}s from {url}')

    # ARBOR DAY SPECIFIC: for each image, substitute thumbnail for large image
    tree_links = list(map(lambda link: link.replace('105.jp', '510.jp'), tree_links))

    # download the images, numbering them in page order
    for i, l in enumerate(tree_links):
        destination = repo_path / 'data' / f'{file_stem}-{i}.jpg'
        urllib.request.urlretrieve(l, destination)

pulling 0 black-locusts from https://shop.arborday.org/black-locust
pulling 4 black-walnuts from https://shop.arborday.org/black-walnut
pulling 0 honey-locusts from https://shop.arborday.org/honey-locust
pulling 7 loblolly-pines from https://shop.arborday.org/loblolly-pine
pulling 7 northern-red-oaks from https://shop.arborday.org/northern-red-oak
pulling 6 pecans from https://shop.arborday.org/pecan
pulling 3 chinese-chestnuts from https://shop.arborday.org/chinese-chestnut


# Harvard Arboretum
http://arboretum.harvard.edu/

In [16]:
def get_harvard_tree_images(search_url, image_base_url, timeout=30):
    """Collect image links for a tree from an image-search results page.

    Links on the search page that contain ``/plant/img/`` are considered.
    A link already ending in ``lg.jpg`` is kept as is. For any other link the
    image key between ``/img/aaimg/`` and ``.mid_200`` is appended to
    ``image_base_url``, that page is fetched, and the ``src`` of every
    ``<img>`` on it is added to the result.

    Parameters
    ----------
    search_url : str
        Address of the search-results page.
    image_base_url : str
        Address prefix of an individual image page; the image key is appended.
    timeout : float, optional
        Seconds to wait for each request before ``requests`` gives up with a
        timeout error. Defaults to 30.

    Returns
    -------
    list of str
        Image links in the order they were found.
    """
    def _image_sources(url):
        # src of every <img> on the page; tags without a src attribute are
        # skipped instead of raising KeyError
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')
        sources = (tag.get('src') for tag in soup.find_all('img'))
        return [src for src in sources if src]

    # subset to links from the plant library
    tree_search_links = [link for link in _image_sources(search_url) if '/plant/img/' in link]

    # get large images
    tag_start = '/img/aaimg/'
    tag_end = '.mid_200'
    tree_links = []
    for link in tree_search_links:
        if link.endswith('lg.jpg'):
            tree_links.append(link)
            continue

        start = link.find(tag_start)
        end = link.find(tag_end)
        if start == -1 or end == -1:
            # no image key can be derived; slicing with -1 would build a
            # bogus image-page URL, so the link is skipped
            continue

        image_page_label = link[start + len(tag_start):end]
        tree_links.extend(_image_sources(image_base_url + image_page_label))

    return tree_links

In [17]:
# constants
source = 'harvard'  # prefix used in the downloaded file names
# earlier search terms; differs from `trees` only in its last entry
trees_old = [
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'chinese chestnut',
]
trees = [
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'castanea',
]
# one flag per entry of `trees`, in the same order
# (presumably whether the site has images for that tree -- TODO confirm)
tree_in_db = [
    True,
    True,
    False,
    False,
    True,
    True,
    True,
]

In [18]:
# for every tree: run the image search, then download each image found
for tree in trees:
    # get list of tree URLs (spaces in the keyword become '+')
    keyword = tree.replace(" ", "+")
    search_url = f'http://arboretum.harvard.edu/plants/image-search/?keyword={keyword}&image_per_page=1000'
    image_base_url = f'https://arboretum.harvard.edu/plants/image-search/?keyword={keyword}&search_type=indiv_img&image_key='
    tree_links = get_harvard_tree_images(search_url, image_base_url)
    print(f'pulling {len(tree_links)} {tree}s from {search_url}')

    # download the images, numbering them in the order they were found
    file_stem = f'{source}-{tree.replace(" ", "")}'
    for i, l in enumerate(tree_links):
        destination = repo_path / 'data' / f'{file_stem}-{i}.jpg'
        urllib.request.urlretrieve(l, destination)

http://labs.arboretum.harvard.edu/plant/img/aaimg/164bd64e0caa71ded3e8719e9e697011.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/14d1e3e5bc92a1ad15ea6566ff72f8b0.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/58d1b27fa03f5fa18ec809279a4945ef.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/17ad4610946eb83285607e698fe2430d.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/c7c3be207b2360e6584649abe3cf4e49.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/0a840b36b5a1755862e73fe8edbebd90.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/1da3a6a7804ed9e4f051d6d70f004abf.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/446bb415507d4e511265df8f88879d8c.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/1733f6d36f7e88d85286d4b7c07c7fa5.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/96f294307669837629a7902d4e993675.lg.jpg
http://labs.arboretum.harvard.edu/plant/img/aaimg/62d32755153ca7476b15419808d0fc7c.lg.jpg
http://lab

# Bing search images

In [3]:
# https://gist.github.com/stephenhouser/c5e2b921c3770ed47eb3b75efbc94799
def get_soup(url, header):
    """Fetch ``url`` with the given request headers and parse it as HTML.

    Parameters
    ----------
    url : str
        Address to fetch.
    header : dict
        HTTP headers sent with the request.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``'html.parser'`` backend.
    """
    request = urllib.request.Request(url, headers=header)
    # the parser is built inside the ``with`` so the body is read while the
    # connection is open, and the response is closed even if parsing fails
    with urllib.request.urlopen(request) as response:
        return BeautifulSoup(response, 'html.parser')

In [4]:
def get_bing_image_urls(query, num_pages=1, start_page=0, per_page=75):
    '''Collect image URLs from Bing image-search result pages.

    query : search terms, as they would appear in Bing
    num_pages : number of result pages to request
    start_page : value added to the ``first`` offset of every request
    per_page : number of results requested per page (the ``count`` parameter)

    Returns a list of ``(image_name, murl, image_desc)`` tuples: the last path
    component of the image URL, the image URL itself (cut off right after the
    first ``.jpg`` when one is present) and the description from the result's
    metadata (``''`` when it has none).
    '''
    # collapse whitespace and URL-encode the terms (spaces become '+') so
    # characters such as '&' cannot break the query string
    query = urllib.parse.quote_plus(' '.join(query.split()))
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}

    ActualImages = []  # contains the link for Large original images, type of image

    for n in range(1, num_pages+1):
        # NOTE(review): n starts at 1, so the first request already asks for
        # offset per_page + start_page, and start_page is added as a raw
        # offset rather than a page count -- confirm this is intended.
        url = f'https://www.bing.com/images/async?q={query}&first={per_page*n+start_page}&count={per_page}'
        soup = get_soup(url, header)

        for a in soup.find_all("a", {"class": "iusc"}):
            # the result metadata is a JSON document stored in the 'm'
            # attribute; anchors without usable metadata are skipped
            try:
                m = json.loads(a.get('m') or '')
            except (TypeError, ValueError):
                continue
            murl = m.get('murl') if isinstance(m, dict) else None
            if not murl:
                continue

            # drop anything following the first '.jpg' (e.g. query strings)
            jpg_loc = murl.lower().find('.jpg')
            if jpg_loc > 0:
                murl = murl[0:jpg_loc+4]

            image_name = urllib.parse.urlsplit(murl).path.split("/")[-1]
            image_desc = m.get('desc', '')

            ActualImages.append((image_name, murl, image_desc))

    print(f'pulled {len(ActualImages)} image urls\n---- Downloading ----')

    return ActualImages

# test the function
# blackwalnut = get_bing_image_urls('black walnut tree', 2)

In [35]:
def save_bing_images(label, image_urls, path):
    """Download images and save them, with their descriptions, under ``path``.

    Images are written to ``path/data/images/<label>/bing-<name>.jpg`` and the
    ``(image_name, image_desc)`` pairs of the images that were saved are
    stored in ``path/data/metadata/bing-<label>.npy``.

    Parameters
    ----------
    label : str
        Name of the image sub-directory and of the metadata file.
    image_urls : iterable of (image_name, murl, image_desc)
        File name, download URL and description of each image.
    path : pathlib.Path
        Root directory under which ``data/`` lives.

    Notes
    -----
    Downloading is best effort: an image that cannot be fetched or written is
    counted as a failure and skipped.
    """
    image_path = path / 'data' / 'images' / label
    metadata_path = path / 'data' / 'metadata'

    # makedirs also creates missing parent directories and accepts ones that
    # already exist
    os.makedirs(image_path, exist_ok=True)
    os.makedirs(metadata_path, exist_ok=True)

    image_descs = []
    load_false = 0
    load_true = 0
    for image_name, murl, image_desc in tqdm(image_urls):
        try:
            # context managers close the connection and the file on errors too
            with urllib.request.urlopen(murl) as response:
                raw_img = response.read()

            filename = 'bing-' + image_name[:60].lower().replace('.jpg', '').replace('.jpeg', '') + '.jpg'
            with open(os.path.join(image_path, filename), 'wb') as f:
                f.write(raw_img)
        except Exception:
            # best effort: count the failure and move on to the next image
            load_false += 1
            continue

        image_descs.append((image_name, image_desc))
        load_true += 1

    print(f'Successfully loaded {load_true} images; failed to load {load_false} images')
    # save the descriptions of every saved image, not just the last one seen
    np.save(metadata_path/f'bing-{label}', image_descs)

# test function
# save_bing_images('blackwalnut', blackwalnut, repo_path)

In [48]:
def save_bing_images(label, image_urls, path):
    """Download images and save them, with their descriptions, under ``path``.

    Images are written to ``path/data/images/<label>/bing-<name>.jpg`` and the
    ``(image_name, image_desc)`` pairs of the images that were saved are
    stored in ``path/data/metadata/bing-<label>.npy``.

    Parameters
    ----------
    label : str
        Name of the image sub-directory and of the metadata file.
    image_urls : iterable of (image_name, murl, image_desc)
        File name, download URL and description of each image.
    path : pathlib.Path
        Root directory under which ``data/`` lives.

    Notes
    -----
    An image that cannot be downloaded is counted as a failure and skipped;
    errors while writing a downloaded image to disk are not caught.
    """
    image_path = path / 'data' / 'images' / label
    metadata_path = path / 'data' / 'metadata'

    # makedirs also creates missing parent directories and accepts ones that
    # already exist
    os.makedirs(image_path, exist_ok=True)
    os.makedirs(metadata_path, exist_ok=True)

    image_descs = []
    load_false = 0
    load_true = 0
    for image_name, murl, image_desc in tqdm(image_urls):

        try:
            # the context manager closes the connection even if read() fails;
            # when urlopen itself fails there is no response to close
            with urllib.request.urlopen(murl) as response:
                raw_img = response.read()
        except Exception:
            load_false += 1
            continue

        filename = 'bing-' + image_name[:60].lower().replace('.jpg', '').replace('.jpeg', '') + '.jpg'
        with open(os.path.join(image_path, filename), 'wb') as f:
            f.write(raw_img)

        image_descs.append((image_name, image_desc))
        load_true += 1

    print(f'Successfully loaded {load_true} images; failed to load {load_false} images')
    # save the descriptions of every saved image, not just the last one seen
    np.save(metadata_path/f'bing-{label}', image_descs)

# test function
# save_bing_images('blackwalnut', blackwalnut, repo_path)

In [None]:
# pull data for all trees
# the tree names are split into three batches; the loop below runs one batch
trees = ['black locust', 'black walnut', 'honey locust']
trees_pt2 = ['loblolly pine', 'northern red oak', 'pecan']
trees_pt3 = ['chinese chestnut']
NUM_PAGES = 5   # result pages requested per tree
PER_PAGE = 60   # results requested per page

for t in trees_pt3:
    print(f'==== Scraping {t} ====')
    # the search appends ' trees' to the name; the save label drops the spaces
    search_terms = t + ' trees'
    folder_label = t.replace(' ', '')
    image_urls = get_bing_image_urls(search_terms, num_pages=NUM_PAGES, per_page=PER_PAGE)
    save_bing_images(label=folder_label, image_urls=image_urls, path=repo_path)


==== Scraping chinese chestnut ====
pulled 300 image urls
---- Downloading ----


 24%|██████████                                | 72/300 [00:18<01:54,  1.99it/s]