In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import sys
import re
from bs4 import BeautifulSoup
import urllib
from pathlib import Path
import os
import shutil
import sys
import json
from tqdm import tqdm
import torch

In [4]:
# Remote instance set-up cell
USE_GPU = True
dtype = torch.float32
# CUDA is picked only when it is both requested and actually available;
# in every other case the notebook runs on the CPU.
device = torch.device('cuda' if (USE_GPU and torch.cuda.is_available()) else 'cpu')
print('using device:', device)

using device: cpu


In [7]:
# directory setup cell
# Builds repo_path / model_path / data_path (all plain strings ending in '/')
# for the environment named by `location`, and makes the repo importable.
repo = 'tree-finder/'
location = 'google'  # one of: 'local', 'google', 'aws'

# Root directory that contains the repo checkout in each environment
local_root = '/Users/etriesch/dev/'
colab_root = '/content/drive/My Drive/git/'
aws_root = '/home/ec2-user/'

# define root
if location == 'local':
  root = local_root
elif location == 'google':
  # imported here rather than at the top because it is only needed in this branch
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  root = colab_root
elif location == 'aws':
  root = aws_root
else:
  # fail here with a clear message instead of a NameError on `root` below
  raise ValueError(
      "unknown location {!r}; expected 'local', 'google' or 'aws'".format(location))

repo_path = root + repo
model_path = repo_path + 'models/'
data_path = repo_path + 'data/'
# re-running this cell must not keep stacking the same entry onto sys.path
if repo_path not in sys.path:
  sys.path.append(repo_path)

Mounted at /content/drive


In [8]:
# setup cell
from notebooks.scrape_fns import make_scrape_profile
from notebooks.scrape_fns import download_arbor_tree_images, download_harvard_tree_images
from notebooks.scrape_fns import download_bing_images

# And change jupyter settings to auto-reload these functions before each instance running them
%load_ext autoreload
%autoreload 2

In [9]:
# Helper imported from notebooks.scrape_fns; see that module for what it configures
make_scrape_profile()

# Image folders under data_path (kept as plain strings):
# one for species-labelled images, one for the binary tree / not-tree images
species_path = f'{data_path}images_spec'
binary_path = f'{data_path}images_bin'

In [None]:
# constants
# Species labels; the per-source boolean masks and search-term arrays defined
# further down are aligned with this array position by position.
trees = np.array([
    'black locust',
    'black walnut',
    'honey locust',
    'loblolly pine',
    'northern red oak',
    'pecan',
    'chinese chestnut',
])

# Pull species-labeled trees

## Arbor Day Foundation website
https://shop.arborday.org/

In [None]:
# Boolean mask aligned with `trees`: only the entries marked True are requested.
# Presumably True means the species is listed on the Arbor Day site - TODO confirm.
tree_in_db = [False, True, False, True, True, True, True]

arbor_trees = trees[tree_in_db]
download_arbor_tree_images(species_path, arbor_trees)

## Harvard Arboretum
http://arboretum.harvard.edu/

In [None]:
# constants
# Search keywords, aligned position by position with `trees`
# (the last entry searches 'castanea' where `trees` has 'chinese chestnut').
search_trees = np.array([
    'black locust', 'black walnut', 'honey locust', 'loblolly pine',
    'northern red oak', 'pecan', 'castanea',
])
# Boolean mask applied to both arrays so labels and keywords stay paired.
# Presumably True means the arboretum search has images for it - TODO confirm.
tree_in_db = [True, True, False, False, True, True, True]

harvard_labels = trees[tree_in_db]
harvard_keywords = search_trees[tree_in_db]
download_harvard_tree_images(species_path, harvard_labels, harvard_keywords)

pulling 43 black locusts from http://arboretum.harvard.edu/plants/image-search/?keyword=black+locust&image_per_page=1000
pulling 84 black walnuts from http://arboretum.harvard.edu/plants/image-search/?keyword=black+walnut&image_per_page=1000
pulling 41 northern red oaks from http://arboretum.harvard.edu/plants/image-search/?keyword=northern+red+oak&image_per_page=1000
pulling 14 pecans from http://arboretum.harvard.edu/plants/image-search/?keyword=pecan&image_per_page=1000
pulling 96 chinese chestnuts from http://arboretum.harvard.edu/plants/image-search/?keyword=castanea&image_per_page=1000


## Bing search images

In [None]:
# One Bing query per species: the species name followed by ' tree'
tree_search_terms = np.array([f'{name} tree' for name in trees])

download_bing_images(
    species_path,
    tree_search_terms,
    num_pages=10,
    photos_per_page=75,
)

# Pull binary-classified trees

In [None]:
# Generic tree queries; the images they return are later gathered into the
# 'tree' folder of the binary dataset.
# Note: this rebinds tree_search_terms, replacing the per-species queries.
tree_queries = ['tree photograph', 'leaves tree photograph', 'bark tree photograph']
tree_search_terms = np.array(tree_queries)

download_bing_images(
    binary_path,
    tree_search_terms,
    num_pages=4,
    photos_per_page=75,
)

==== Scraping tree photograph ====
pulled 300 image urls
---- Downloading ----


100%|██████████| 300/300 [02:35<00:00,  1.93it/s]


Successfully loaded 275 images
==== Scraping leaves tree photograph ====
pulled 300 image urls
---- Downloading ----


100%|██████████| 300/300 [02:42<00:00,  1.84it/s]


Successfully loaded 272 images
==== Scraping bark tree photograph ====
pulled 300 image urls
---- Downloading ----


100%|██████████| 300/300 [02:14<00:00,  2.23it/s]

Successfully loaded 282 images





In [None]:
# Non-tree queries; the images they return are later gathered into the
# 'nottree' folder of the binary dataset.
nottree_queries = ['table', 'map', 'cartoon tree drawing', 'diagram', 'hand']
nottree_search_terms = np.array(nottree_queries)

download_bing_images(
    binary_path,
    nottree_search_terms,
    num_pages=2,
    photos_per_page=75,
)

==== Scraping table ====
pulled 150 image urls
---- Downloading ----


100%|██████████| 150/150 [01:48<00:00,  1.38it/s]


Successfully loaded 140 images
==== Scraping map ====
pulled 150 image urls
---- Downloading ----


100%|██████████| 150/150 [01:39<00:00,  1.50it/s]


Successfully loaded 129 images
==== Scraping cartoon tree drawing ====
pulled 150 image urls
---- Downloading ----


100%|██████████| 150/150 [01:19<00:00,  1.90it/s]


Successfully loaded 141 images
==== Scraping diagram ====
pulled 150 image urls
---- Downloading ----


100%|██████████| 150/150 [00:51<00:00,  2.94it/s]


Successfully loaded 141 images
==== Scraping hand ====
pulled 150 image urls
---- Downloading ----


100%|██████████| 150/150 [01:15<00:00,  1.99it/s]

Successfully loaded 130 images





### Reorganize images into the binary classification folder structure

In [None]:
# make tree folder
# Gather the images from every tree search folder into <binary_path>/tree,
# then delete the per-search folders.
# binary_path is a plain string, so wrap it in Path before using the `/` operator.
binary_dir = Path(binary_path)
target_dir = binary_dir / 'tree'
target_dir.mkdir(exist_ok=True)

for search in tree_search_terms:
    # assumes each search was saved in a folder named after the term with the
    # spaces removed - TODO confirm against download_bing_images
    source_dir = binary_dir / search.replace(' ', '')
    if not source_dir.is_dir():
        # nothing to gather (e.g. this cell already ran); check BEFORE listing
        # the folder so a re-run skips it instead of raising FileNotFoundError
        print('skipping missing folder:', source_dir)
        continue
    # NOTE(review): shutil.copy overwrites, so files that share a name across
    # search folders replace one another in target_dir
    for file_name in os.listdir(source_dir): # copy files
        shutil.copy(source_dir / file_name, target_dir)
    shutil.rmtree(source_dir) # delete folder

In [None]:
# make not-tree folder
# Gather the images from every non-tree search folder into <binary_path>/nottree,
# then delete the per-search folders.
# binary_path is a plain string, so wrap it in Path before using the `/` operator.
binary_dir = Path(binary_path)
target_dir = binary_dir / 'nottree'
target_dir.mkdir(exist_ok=True)

for search in nottree_search_terms:
    # assumes each search was saved in a folder named after the term with the
    # spaces removed - TODO confirm against download_bing_images
    source_dir = binary_dir / search.replace(' ', '')
    if not source_dir.is_dir():
        # nothing to gather (e.g. this cell already ran); check BEFORE listing
        # the folder so a re-run skips it instead of raising FileNotFoundError
        print('skipping missing folder:', source_dir)
        continue
    # NOTE(review): shutil.copy overwrites, so files that share a name across
    # search folders replace one another in target_dir
    for file_name in os.listdir(source_dir): # copy files
        shutil.copy(source_dir / file_name, target_dir)
    shutil.rmtree(source_dir) # delete folder