## Read, filter and clean Camden dataset

In [None]:
import os
import pandas as pd
import cv2
from PIL import Image as PImage

In [None]:
# Relative path to the Camden tree inventory CSV that the rest of the notebook reads
camden_trees = '../Trees_In_Camden.csv'

In [None]:
# Load the raw Camden tree inventory into a DataFrame
df = pd.read_csv(camden_trees) 

#### Remove 'Unknown', 'Vacant pit' and 'remove'-flagged tree sites, and rows with no trees, from data

In [None]:
# Keep only rows that describe an actual, present tree:
#   * the lower-cased 'Common Name' must not contain 'vacant', 'unknown' or 'remove'
#   * 'Number Of Trees' must be greater than zero
# Vectorised string/number masks replace the original row-wise
# ``df.apply(..., axis=1)``, which invoked a Python lambda once per row.
common_name_lower = df['Common Name'].astype(str).str.lower()
# na=False: a missing name is never treated as a match, so such rows are kept,
# exactly as ``str(nan).lower()`` ('nan') was kept by the original check.
has_excluded_word = common_name_lower.str.contains(
    'vacant|unknown|remove', regex=True, na=False)
df = df[~has_excluded_word & (df['Number Of Trees'] > 0)]

#### Remove trees with no location

In [None]:
# Keep only the rows whose 'Location' value is present
df = df[df['Location'].notna()]

#### Summary statistics of Species and Maturity

In [None]:
# Non-null 'Number Of Trees' entries per 'Maturity' value, largest group first
entries_per_maturity = df.groupby('Maturity')[['Number Of Trees']].count()
maturity_count = entries_per_maturity.sort_values('Number Of Trees', ascending=False)
maturity_count

In [None]:
# Non-null 'Number Of Trees' entries per 'Common Name', largest group first
entries_per_species = df.groupby('Common Name')[['Number Of Trees']].count()
common_count = entries_per_species.sort_values('Number Of Trees', ascending=False)
# Show the 15 most frequent species
common_count[:15]

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-species counts on a logarithmic y axis.
fig, ax = plt.subplots(figsize=(8, 6))
# bottom=0.1 starts the bars above zero -- presumably so they remain
# drawable once the y axis is switched to log scale.
common_count.hist(bins=120, bottom=0.1, ax=ax)
ax.set(yscale='log',
       xlabel='Number of species examples',
       ylabel='Count of species')
# Set the title after hist() and keep it as the last expression of the cell
ax.set_title('Camden tree species distribution', fontweight='bold')

In [None]:
# Save the per-species counts, then display how many species have a count of 50 or more
common_count.to_csv('species_count.csv')
has_50_or_more = common_count['Number Of Trees'] >= 50
len(common_count[has_50_or_more])

In [None]:
# Restrict the study to species whose count is at least 50
min_examples = 50
chosen_species = common_count.loc[common_count['Number Of Trees'] >= min_examples]

In [None]:
# Keep only the trees whose 'Common Name' appears in the index of
# ``chosen_species``. ``Series.isin`` performs the membership test in one
# vectorised call instead of running a Python lambda for every row.
final_df = df[df['Common Name'].isin(chosen_species.index)]
# Persist the cleaned dataset
final_df.to_csv('camden_clean.csv')

## Download an aerial image from Google Maps for each tree in the cleaned dataset

In [None]:
import os
from os import path
import urllib.request
from time import sleep

# Create the root folder for downloaded images. exist_ok=True makes this a
# no-op when the folder is already there and avoids the check-then-create race.
os.makedirs('../camden_images', exist_ok=True)

#### Set URL Parameters needed for retrieving Google Images

In [None]:
# Base Static Maps request: zoom 20, 200x200 pixels, satellite imagery.
# It ends with 'center=' so that the coordinates can be appended directly.
URL = ('https://maps.googleapis.com/maps/api/staticmap'
       '?zoom=20&size=200x200&maptype=satellite&center=')
# API key is read from the environment; None when the variable is not set
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')

In [None]:
# Download one aerial image per tree into '../camden_images/<Common Name>/',
# skipping trees whose image file is already on disk.
for index, row in final_df.iterrows():

    # Make sure a folder named after the species exists
    species_path = '../camden_images/' + row['Common Name']
    os.makedirs(species_path, exist_ok=True)

    # Nothing to do when the image for this identifier was already fetched
    tree_image = species_path + '/' + 'tree_aerial_' + row['Identifier'] + '.png'
    if os.path.isfile(tree_image):
        continue

    # Fail with a clear message instead of a TypeError from ``str + None``
    if not API_KEY:
        raise RuntimeError('GOOGLE_MAPS_API_KEY environment variable is not set')

    print('Now getting image for ' + row['Identifier'] + ' ' + row['Common Name'])

    # Turn '(lat, lon)' into 'lat,lon' for the 'center=' query parameter
    centre = row['Location'].replace("(", "").replace(")", "").replace(" ", "")

    # The key must be sent as its own '&key=' query parameter; previously it
    # was glued straight onto the coordinates. A value that already carries
    # the '&key=' prefix is accepted unchanged.
    key_param = API_KEY if API_KEY.startswith('&key=') else '&key=' + API_KEY
    google_url = URL + centre + key_param
    urllib.request.urlretrieve(google_url, tree_image)

    # Short pause between consecutive requests
    sleep(0.2)

## Remove logo by cropping the bottom strip of each image

In [None]:
# Collect the path of every PNG file found anywhere under the image folder.
camden_path = '../camden_images/'
img_paths = []
for subdir, dirs, files in os.walk(camden_path):
    for file in files:
        # Test the file name's suffix rather than searching the whole path for
        # '.png', which also matched names such as 'x.png.bak' or any file
        # below a directory whose name contains '.png'.
        if file.endswith('.png'):
            # Named ``png_path`` so the ``path`` imported from ``os`` is not shadowed
            png_path = subdir + '/' + file
            img_paths.append(png_path)

In [None]:
# Overwrite every collected image with its top-left 200x175 region, which
# discards everything below row 175 (the bottom 25 rows of a 200x200 image).
crop_box = (0, 0, 200, 175)  # (left, top, right, bottom)
for img_path in img_paths:
    # Open inside a context manager so the source file handle is released
    # before the same path is overwritten below.
    with PImage.open(img_path) as source:
        img = source.crop(crop_box)
    img.save(img_path, 'png')