In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from bs4 import BeautifulSoup
import urllib
from pathlib import Path
import os
import shutil
import sys
import json
from tqdm import tqdm

# constants
# Root of the project checkout. Set the TREE_FINDER_REPO environment variable to
# run the notebook from a different location; when it is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
repo_path = Path(os.environ.get('TREE_FINDER_REPO', '/Users/etriesch/dev/tree-finder'))
# Destination passed to the species-labeled downloads.
species_path = repo_path / 'data/images_spec'
# Destination passed to the binary (tree / not-tree) downloads.
binary_path = repo_path / 'data/images_bin'

In [14]:
# setup cell
from scrape_fns import make_scrape_profile
from scrape_fns import download_arbor_tree_images, download_harvard_tree_images
from scrape_fns import download_bing_images

# And change jupyter settings to auto-reload these functions before each instance running them
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# One-time setup call imported from scrape_fns.
# NOTE(review): what this creates is not visible in this notebook — presumably a
# browser/session profile used by the download helpers. TODO confirm in scrape_fns.
make_scrape_profile()

In [4]:
# constants
# Species to collect images for. The order matters: later cells select from this
# array with positional boolean masks.
trees = np.array([
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'chinese chestnut',
])

# Pull species-labeled trees

## Arbor Day Foundation website
https://shop.arborday.org/

In [4]:
# Positional mask over `trees`: True marks a species to request from this site.
tree_in_db = [
    False,  # black locust
    True,   # black walnut
    False,  # honey locust
    True,   # loblolly pine
    True,   # northern red oak
    True,   # pecan
    True,   # chinese chestnut
]

# Keep only the selected species, then download their images into species_path.
arbor_trees = trees[tree_in_db]
download_arbor_tree_images(species_path, arbor_trees)

## Harvard Arboretum
http://arboretum.harvard.edu/

In [13]:
# constants
# Search keyword to use for each entry of `trees` (same order). Only the last
# one differs from the label: 'chinese chestnut' is searched as 'castanea'.
search_trees = np.array([
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'castanea',
])
# Positional mask over `trees` / `search_trees`: True marks a species to request.
tree_in_db = [
    True,   # black locust
    True,   # black walnut
    False,  # honey locust
    False,  # loblolly pine
    True,   # northern red oak
    True,   # pecan
    True,   # chinese chestnut (keyword: castanea)
]

# Apply the same mask to labels and keywords so they stay aligned.
harvard_trees = trees[tree_in_db]
harvard_search_terms = search_trees[tree_in_db]
download_harvard_tree_images(species_path, harvard_trees, harvard_search_terms)

pulling 43 black locusts from http://arboretum.harvard.edu/plants/image-search/?keyword=black+locust&image_per_page=1000
pulling 84 black walnuts from http://arboretum.harvard.edu/plants/image-search/?keyword=black+walnut&image_per_page=1000
pulling 41 northern red oaks from http://arboretum.harvard.edu/plants/image-search/?keyword=northern+red+oak&image_per_page=1000
pulling 14 pecans from http://arboretum.harvard.edu/plants/image-search/?keyword=pecan&image_per_page=1000
pulling 96 chinese chestnuts from http://arboretum.harvard.edu/plants/image-search/?keyword=castanea&image_per_page=1000


## Bing search images

In [None]:
# Bing queries: every species name followed by the word 'tree'.
tree_search_terms = np.array([f'{species} tree' for species in trees])

# Download the results for all species queries into species_path.
download_bing_images(
    species_path,
    tree_search_terms,
    num_pages=10,
    photos_per_page=75,
)

# Pull binary-classified trees

In [33]:
# Queries for the binary data set. The first entry is the tree class; the
# folder-reorganization cells further down rely on that position.
search_terms = np.array([
    'tree photograph',
    'table',
    'map',
    'cartoon tree drawing',
    'diagram',
    'hand',
])

# One page of results per query, saved under binary_path.
download_bing_images(
    binary_path,
    search_terms,
    num_pages=1,
    photos_per_page=75,
)

==== Scraping tree photograph ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [01:03<00:00,  1.18it/s]


Successfully loaded 70 images
==== Scraping table ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [00:52<00:00,  1.44it/s]


Successfully loaded 69 images
==== Scraping map ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [00:52<00:00,  1.44it/s]


Successfully loaded 65 images
==== Scraping cartoon tree drawing ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [00:44<00:00,  1.69it/s]


Successfully loaded 69 images
==== Scraping diagram ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [00:41<00:00,  1.79it/s]

Successfully loaded 70 images





In [34]:
# Fetch the 'hand' query on its own.
# NOTE(review): 'hand' is also the last entry of search_terms, and the recorded
# output of the batch download stops after 'diagram' — this looks like a manual
# re-run for the one remaining term. TODO confirm whether this cell is still needed.
hand_terms = np.array(['hand'])
download_bing_images(binary_path, hand_terms, num_pages=1, photos_per_page=75)

==== Scraping hand ====
pulled 75 image urls
---- Downloading ----


100%|███████████████████████████████████████████| 75/75 [01:00<00:00,  1.23it/s]

Successfully loaded 66 images





### Reorganize the downloaded images into a binary (tree / not-tree) folder structure

In [38]:
# Reorganize the photos so it's a binary classification layout.
# The tree class must be the first entry in search_terms.
# make tree folder
# The folder for a search term is looked up under the term with its spaces removed.
tree_folder_name = ''.join(search_terms[0].split(' '))
source_dir = binary_path / tree_folder_name
target_dir = binary_path / 'tree'
# Nothing to do when the source folder is absent (e.g. it was renamed already).
if os.path.exists(source_dir):
    os.rename(source_dir, target_dir)

In [50]:
# make not-tree folder
# Merge the folder of every non-tree search term into a single 'nottree' folder.
# search_terms[0] is the tree class, so only the remaining terms are merged here.
target_dir = binary_path / 'nottree'
os.makedirs(target_dir, exist_ok=True)  # safe to re-run: no error if it already exists

for search in search_terms[1:]:
    # The folder for a search term is looked up under the term with its spaces removed.
    source_dir = binary_path / search.replace(' ', '')
    # Check BEFORE listing: once a folder has been merged it is deleted below, so a
    # second run of this cell (or a term that was never downloaded) must be skipped
    # instead of failing inside os.listdir.
    if not os.path.isdir(source_dir):
        print(f'skipping {search}: {source_dir} not found')
        continue
    for file_name in os.listdir(source_dir):
        destination = target_dir / file_name
        # File naming is decided by the download helper and is not visible here;
        # if two terms produced the same file name, keep both by prefixing the
        # incoming file with its source folder name rather than overwriting.
        if os.path.exists(destination):
            destination = target_dir / f'{source_dir.name}_{file_name}'
        # Move rather than copy-then-delete: no temporary duplicate of every image.
        shutil.move(str(source_dir / file_name), str(destination))
    shutil.rmtree(source_dir)  # remove the emptied folder

table
map
cartoon tree drawing
diagram
