# OneForest 

We introduce the package OneForest, an end-to-end method to create a neural network able to predict tree characteristics from a single image of the tree canopy. To train the network, we need a large tree dataset. However, there is no existing high-resolution labeled dataset on forests. 

We developed OneForest, a solution to leverage our understanding of trees by smartly combining ground and drone data. The main challenge is to properly transfer information from a small dataset of groundtruth observations to a large set of detected trees on drone images. Using different mapping methods robust to GPS noise in ground data and inexact tree detection, we fuse citizen science and remote sensing data. We use the resulting dataset to train a Convolutional Neural Network to predict tree characteristics (such as tree species) based on the drone image of the tree canopy.

This Jupyter notebook presents the three steps of OneForest:
- Data pre-processing: preparation of ground and drone data (tree detection, features,...)
- Mapping drone and ground data with different methods: Nearest Neighbours, Graph Matching Networks, Optimal Transport, Gromov-Wasserstein
- Species classification by a neural network trained on the tree datasets obtained after the mapping step.

We work on two datasets:
- Ecuador dataset
- NEON dataset


## Load packages and modules

In [None]:
!pip install -Ur ../requirements.txt

In [None]:
import os
import matplotlib.pylab as plt
import seaborn as sns

colors = sns.color_palette('tab10')
mypalette={'NN':colors[0], 'GMN':colors[4], 'OT':colors[1], 'OT on GPS position':colors[1], 'GW':colors[2], 'OT on GPS position + Tree species':colors[3]}
import numpy as np
import pandas as pd
import rasterio
from rasterio.plot import reshape_as_image
import PIL
PIL.Image.MAX_IMAGE_PIXELS = None
from PIL import Image
import tensorflow
import sys
package = os.path.dirname(os.getcwd())
sys.path.append(package)
sys.path.append(package + '/utils')
sys.path

In [None]:
from utils.extract_features import *
from utils.deepforest_detection import *
from utils.visualisation import *
from utils.plot_folium import *
from utils.plot_density import *
from utils.mapping import *

## Ecuador Dataset

### 1. Data Preparation

#### Orthomosaics RGB Features

- Extract features for each Orthomosaic RGB
- Split the Orthomosaics into 4000x4000 tiles

Return the dataframe ortho_data.csv that contains all important information (position) on orthomosaics RGB: minimal and maximal latitude, longitude, width and height (pixels).

In [None]:
# Final Plot Information

shapefile = gpd.read_file("Ecuador/wwf_ecuador/Merged_final_plots/Merged_final_plots.shp")
merged_final = pd.DataFrame(shapefile)

In [None]:
merged_final.Name

In [None]:
print(merged_final.Name.iloc[4])
merged_final.geometry.iloc[4]


In [None]:
# Read the world file (.tfw) that accompanies one orthomosaic and show its lines.
# The context manager closes the handle as soon as the text has been read.
with open("Ecuador/wwf_ecuador/RGB Orthomosaics/Flora Pluas RGB.tfw", "r") as tfw_file:
    tfw_raw_data = tfw_file.read()  # reading all text from file
tfw_raw_data.split("\n")
# [ratio_x, , , -ratio_y, lon_min, lat_max, ]

In [None]:
# Open image file for reading (binary mode); the handle is closed once the
# Exif tags have been parsed.
path_to_raster = os.path.join('Ecuador/wwf_ecuador/RGB Orthomosaics/Flora Pluas RGB.tif')
with open(path_to_raster, 'rb') as f:
    # Return Exif tags
    tags = exifread.process_file(f)

# Print the tag/ value pairs
for tag, value in tags.items():
    print("Key: %s, value %s" % (tag, value))

In [None]:
directory = "Ecuador/wwf_ecuador/RGB Orthomosaics"
ortho_features = read_orthomosaics(directory)

In [None]:
def myround(x, base=4000):
    """Round ``x`` up to the nearest multiple of ``base``.

    A value that is already an exact multiple of ``base`` is returned
    unchanged, so an image whose side is a multiple of the tile size gets no
    extra padding.

    Parameters
    ----------
    x : int or float
        Value to round up (e.g. an image width or height in pixels).
    base : int, default 4000
        Step to round to.

    Returns
    -------
    int
        The smallest multiple of ``base`` that is >= ``x``.
    """
    # -(-x // base) is a ceiling division that works for ints and floats.
    return base * int(-(-x // base))

def clear(mydir):
    """Delete every entry found directly inside ``mydir``.

    Entries are removed with ``os.remove``; the directory itself is kept.
    """
    entries = [os.path.join(mydir, entry) for entry in os.listdir(mydir)]
    for entry_path in entries:
        os.remove(entry_path)

In [None]:
# Collect the pixel dimensions of every orthomosaic and split each one into tiles.
ortho_dir = 'Ecuador/wwf_ecuador/RGB Orthomosaics'
ortho_dim = []
for file in os.listdir(ortho_dir):
    if not file.endswith('.tif'):
        continue

    path_to_raster = os.path.join(ortho_dir, file)
    # Open image file for reading (binary mode); the context manager closes
    # the handle once the Exif tags are parsed, so no descriptor is leaked
    # per orthomosaic.
    with open(path_to_raster, 'rb') as f:
        tags = exifread.process_file(f)
    width = int(str(tags['Image ImageWidth']))
    height = int(str(tags['Image ImageLength']))
    name = file.replace('.tif','')
    ortho_dim.append([name, width, height])

    # Padding (optional) for a better split -- kept for reference, disabled:
    #
    #   img = cv2.imread(path_to_raster)
    #   img_padded = cv2.copyMakeBorder(img, 0, myround(height)-height, 0, myround(width)-width, cv2.BORDER_CONSTANT, value=0)
    #   cv2.imwrite('wwf_ecuador/RGB Orthomosaics/{}_padded.tif'.format(name), img_padded)
    #
    #   plt.imshow(cv2.cvtColor(img_padded, cv2.COLOR_BGR2RGB))

    # Start from an empty tile folder so tiles from a previous run never mix
    # with the new split.
    tiles_dir = "Ecuador/images/%s"%name
    os.makedirs(tiles_dir, exist_ok=True)
    clear(tiles_dir)
    split_raster(path_to_raster, base_dir=tiles_dir, patch_size=4000, patch_overlap=0.05)

ortho_dim = pd.DataFrame(data = ortho_dim, columns=['name', 'width', 'height'])

In [None]:
ortho_data = pd.merge(ortho_features, ortho_dim, on = 'name')
ortho_data['ratio_x_init'] = ortho_data.apply(lambda x: ratio(x.width, x.lon_min, x.lon_max), axis=1)
ortho_data['ratio_y_init'] = ortho_data.apply(lambda x: ratio(x.height, x.lat_min, x.lat_max), axis=1)
ortho_data.to_csv('Ecuador/features/ortho_data.csv', index = False)
ortho_data

In [None]:
# Compute, for every site, the scale between the drone bounds and the bounds
# of the ground observations, then rescale the drone bounds around the site
# centre and save everything back to ortho_data.csv.
ortho_data = pd.read_csv('Ecuador/features/ortho_data.csv', index_col = None)

list_sites = ['Carlos Vera Arteaga RGB', 'Carlos Vera Guevara RGB', 'Leonor Aspiazu RGB', 
             'Manuel Macias RGB', 'Nestor Macias RGB', 'Flora Pluas RGB']

# The ground observations do not depend on the site: read them once instead
# of once per loop iteration.
ground_files = pd.read_csv('Ecuador/features/initial_ground_data.csv')

for site_name in list_sites:
    # Compute Scale
    # Bounds Drone
    is_site = ortho_data.name == site_name
    site = ortho_data[is_site]
    max_lat = site['lat_max'].values[0]
    min_lat = site['lat_min'].values[0]
    max_lon = site['lon_max'].values[0]
    min_lon = site['lon_min'].values[0]
    bounds_drone = [min_lon, min_lat, max_lon, max_lat]

    # Bounds Ground: ground trees falling strictly inside the drone bounds
    ground_files_site = ground_files.loc[(min_lat < ground_files.lat) 
                                    & (ground_files.lat < max_lat) 
                                    & (min_lon < ground_files.lon)
                                   & (ground_files.lon < max_lon)]
    bounds_ground = get_bounds(ground_files_site)

    scale = get_scale(bounds_drone, bounds_ground)

    # Store scale for each site in ortho_data
    ortho_data.loc[is_site, 'scale_lat'] = scale[0]
    ortho_data.loc[is_site, 'scale_lon'] = scale[1]
    
    # Store the centre of the drone bounds for each site in ortho_data
    ortho_data.loc[is_site, 'center_lat'] = (min_lat + max_lat) * .5
    ortho_data.loc[is_site, 'center_lon'] = (min_lon + max_lon) * .5

# Redefine ratios
ortho_data['ratio_y'] = ortho_data['ratio_y_init']/ortho_data['scale_lat']
ortho_data['ratio_x'] = ortho_data['ratio_x_init']/ortho_data['scale_lon']

# Rename: keep the original bounds under an "_init" suffix
ortho_data = ortho_data.rename(columns={'lat_min': 'lat_min_init', 'lat_max': 'lat_max_init', 'lon_min': 'lon_min_init', 'lon_max': 'lon_max_init'})

# Rescale drone bounds around the site centre
ortho_data['lat_min'] = (ortho_data['lat_min_init'] - ortho_data['center_lat'])/ortho_data['scale_lat'] + ortho_data['center_lat']
ortho_data['lat_max'] = (ortho_data['lat_max_init'] - ortho_data['center_lat'])/ortho_data['scale_lat'] + ortho_data['center_lat']
ortho_data['lon_min'] = (ortho_data['lon_min_init'] - ortho_data['center_lon'])/ortho_data['scale_lon'] + ortho_data['center_lon']
ortho_data['lon_max'] = (ortho_data['lon_max_init'] - ortho_data['center_lon'])/ortho_data['scale_lon'] + ortho_data['center_lon']


ortho_data.to_csv('Ecuador/features/ortho_data.csv', index = False)
ortho_data
        

#### DeepForest Tree Detection

Return annotations by DeepForest for each site in Ecuador.

In [None]:
# Load model
model = deepforest.deepforest(saved_model = os.getcwd()+'/Ecuador/deepforest/final_model_4000_epochs_35.h5')

In [None]:
# Predict bounding boxes on every tile of every site - return annotation files
column_names = ['img_path', 'xmin', 'ymin', 'xmax', 'ymax', 'score']

# Absolute path of the tile folders. The tiles live under "Ecuador/images";
# the listing and the path handed to get_annotations share this one root so
# they cannot disagree.
images_root = os.path.join(os.getcwd(), "Ecuador", "images")
for folder in os.listdir(images_root):
    if folder.startswith('.'):
        continue  # skip hidden entries

    # Start with an empty, correctly-labelled frame so a site without tiles
    # still produces a CSV with the expected header.
    frames = [pd.DataFrame(columns = column_names)]
    for file in os.listdir(os.path.join(images_root, folder)):
        if not file.startswith('.'):
            frames.append(get_annotations(os.path.join(images_root, folder, file), model))

    # Concatenate once per site instead of growing the frame tile by tile
    annotations_files = pd.concat(frames).reset_index(drop=True)
    file_path = 'Ecuador/annotations/{}_annotations.csv'.format(folder)
    annotations_files.to_csv(file_path, index=False)
    print('DeepForest Annotations are saved for site {}'.format(folder))


#### Process Data

For each tree annotation by DeepForest on drone images, we add features: tree location, tile position, site, probability of the tree to belong to the dominant species "Musacea".

In [None]:
# Get the CNN for initial tree species prediction for Ecuador dataset
cnn_model = tensorflow.keras.models.load_model('Ecuador/cnn/cnn_model')

In [None]:
# Enrich the DeepForest annotations of every site with tile features, global
# pixel coordinates, lon/lat and the CNN prediction from predict_musacea.
# ortho_data.csv lives under "Ecuador/features", like in every other cell.
ortho_data = pd.read_csv('Ecuador/features/ortho_data.csv', index_col = None)

list_sites = ['Carlos Vera Arteaga RGB', 'Carlos Vera Guevara RGB', 'Leonor Aspiazu RGB', 
             'Manuel Macias RGB', 'Nestor Macias RGB', 'Flora Pluas RGB']

for site_name in list_sites:
    file = '{}_annotations.csv'.format(site_name)
    if file not in os.listdir("Ecuador/annotations"):
        continue  # no DeepForest annotations were produced for this site

    file_path = 'Ecuador/annotations/%s'%file
    df = pd.read_csv(file_path)

    # Tile features are returned by expand_tile_features from the tile path
    df['img_name'], df['tile_index'], df['tile_xmin'], df['tile_ymin'], df['tile_xmax'], df['tile_ymax'] = zip(*df['img_path'].map(expand_tile_features))
    # Box centre in tile coordinates
    df[['x', 'y']] = df.apply(lambda x: [get_center(x.xmin,x.xmax), get_center(x.ymin,x.ymax)], axis=1, result_type="expand")

    # Shift tile coordinates by the tile origin (capitalised columns)
    df['Xmin'] = df.xmin + df.tile_xmin
    df['Ymin'] = df.ymin + df.tile_ymin
    df['Xmax'] = df.xmax + df.tile_xmin
    df['Ymax'] = df.ymax + df.tile_ymin
    df['X'] = df.x + df.tile_xmin
    df['Y'] = df.y + df.tile_ymin

    df[['lon', 'lat']] = df.apply(lambda x: convert_xy_tile_to_lonlat(x.img_name, x.tile_xmin, x.tile_ymin, x.x, x.y, ortho_data), axis=1, result_type="expand")
    df.to_csv('Ecuador/annotations/{}_processed.csv'.format(file.replace('.csv','')), index = False)
    df = predict_musacea(df, site_name, cnn_model)
    df.to_csv('Ecuador/annotations/{}_processed_cnn.csv'.format(file.replace('.csv','')), index = False)
    print('Site {} is done'.format(site_name))
        

In [None]:
# Stack the per-site "*cnn.csv" annotation files into a single annotations file
final_annotations = pd.DataFrame()

cnn_files = [entry for entry in os.listdir("Ecuador/annotations") if entry.endswith('cnn.csv')]
for cnn_file in cnn_files:
    site_annotations = pd.read_csv('Ecuador/annotations/%s' % cnn_file)
    final_annotations = pd.concat([final_annotations, site_annotations])

final_annotations.to_csv('Ecuador/annotations/final_annotations.csv')

#### Visualize DeepForest Predictions

In [None]:
site_name = input('Enter the site name to explore: ')

In [None]:
# Draw bounding boxes on images
# Uses the tiles of `site_name` and its processed annotation file.
path_to_img = 'Ecuador/images/{}'.format(site_name)
path_to_annot = 'Ecuador/annotations/{}_annotations_processed.csv'.format(site_name)
annot = pd.read_csv(path_to_annot)

for tile in os.listdir(path_to_img):
    # Boxes whose img_path equals this tile's file name
    img_boxes = annot.loc[annot.img_path == tile]
    if len(img_boxes)>0:
        # Convert the stored boxes into the annotation format expected by draw_annotations
        boxes = img_boxes[["xmin", "ymin", "xmax", "ymax"]].to_numpy()
        deepforest_annotations = box_to_annotation(boxes)

        path = os.path.join(path_to_img, tile)
        plt.figure(figsize=(15,15))
        im = cv2.imread(path)
        im = draw_annotations(im, deepforest_annotations, color=(255, 0, 0), label_to_name=None, show_caption = True, cv2_authorized = False, thickness = 5)
        plt.show()
    # NOTE(review): this break sits at loop level, so only the first entry
    # returned by os.listdir is ever examined; if that tile has no boxes,
    # nothing is drawn. Presumably it was meant to stop after the first tile
    # that was actually displayed -- confirm before moving it into the `if`.
    break


#### Ground data

In [None]:
# Extract citizen science
# For each tree (reported), we have GPS coordinates, species, size,..

shapefile = gpd.read_file("Ecuador/wwf_ecuador/Final_Trees/Final_Trees.shp")
ground_files = pd.DataFrame(shapefile)
ground_files.columns

In [None]:
# We keep as features for each Tree: 
# Name: 'Variedad_1', 
# Lat:'_Gps_1_lat', 
# Lon:'_Gps_1_lon', 
# Average Diameter: 'Ave_Diamet', 
# Height of Tree: 'Altura_del', 
# Year when the tree was planted: 'Plant_Yr'
# Plot identifier: Plot
# Index of Tree: _index
column_names = ["name",  "lat", "lon", "diameter", 'height', 'year', 'plot_id', 'tree_id']
ground_files = ground_files.rename(columns={'Variedad_1' : "name",  '_Gps_1_lat' : "lat", '_Gps_1_lon' : "lon",  
                   'Ave_Diamet': "diameter", 'Altura_del':'height', 'Plant_Yr':'year', 'Plot':'plot_id', '_index': 'tree_id'})
# Take an explicit copy of the selected columns: the next line adds a column,
# and doing that on a column subset of another frame triggers pandas'
# SettingWithCopyWarning.
ground_files = ground_files[column_names].copy()
# NOTE(review): the flag is 0 for 'Musacea' and 1 for every other species,
# i.e. the opposite of what the column name suggests. Left unchanged because
# predict_musacea / acc_musacea may rely on this convention -- confirm.
ground_files['is_musacea'] = ground_files.name.apply(lambda x: 0 if x == 'Musacea' else 1)

In [None]:
# For each site, keep the ground trees lying within the drone bounds (plus a
# margin `d`), convert their lon/lat to pixel coordinates and save one CSV per site.
list_sites = ['Carlos Vera Arteaga RGB', 'Carlos Vera Guevara RGB', 'Leonor Aspiazu RGB', 
             'Manuel Macias RGB', 'Nestor Macias RGB', 'Flora Pluas RGB']

ortho_data = pd.read_csv('Ecuador/features/ortho_data.csv', index_col = None)

for site_name in list_sites:
    site = ortho_data[ortho_data.name == site_name]

    # Margin added on every side of the drone bounds
    d = 0.001
    min_lat, max_lat = site['lat_min'].values[0], site['lat_max'].values[0]
    min_lon, max_lon = site['lon_min'].values[0], site['lon_max'].values[0]

    lat_in_range = (ground_files.lat > min_lat-d) & (ground_files.lat < max_lat+d)
    lon_in_range = (ground_files.lon > min_lon-d) & (ground_files.lon < max_lon+d)

    ground_files_site = ground_files.loc[lat_in_range & lon_in_range].reset_index(drop = True)
    ground_files_site['site'] = site_name

    # convert_lonlat_to_xy yields one (X, Y) pair per tree
    pos_xy = ground_files_site.apply(
        lambda x: convert_lonlat_to_xy(site_name, x.lon, x.lat, ortho_data),
        axis = 1,
    ).to_numpy()
    ground_files_site[['X', 'Y']] = pd.DataFrame(pos_xy.tolist())

    ground_files_site.to_csv('Ecuador/features/ground_data_{}.csv'.format(site_name), index = False)

In [None]:
# Merge the per-site ground data files ("ground*.csv") into a unique ground data file
ground_data = pd.DataFrame()

ground_csvs = [entry for entry in os.listdir("Ecuador/features") if entry.startswith('ground')]
for ground_csv in ground_csvs:
    site_ground = pd.read_csv('Ecuador/features/%s' % ground_csv)
    ground_data = pd.concat([ground_data, site_ground])

ground_data = ground_data.reset_index(drop = True)
ground_data.to_csv('Ecuador/features/final_ground_data.csv')

#### Visualize all sites 

In [None]:
final_annotations = pd.read_csv('Ecuador/annotations/final_annotations.csv', index_col = 0)
final_ground_data = pd.read_csv('Ecuador/features/final_ground_data.csv', index_col = 0)

In [None]:
X_drone = final_annotations[['lat', 'lon']].to_numpy()
X_ground = final_ground_data[['lat', 'lon']].to_numpy()

In [None]:
max_lat = final_annotations.lat.max()
min_lat = final_annotations.lat.min()

max_lon = final_annotations.lon.max()
min_lon = final_annotations.lon.min()

ECUADOR_COORDINATES = [(min_lat + max_lat)/2, (min_lon + max_lon)/2]

In [None]:
plot_initial(ECUADOR_COORDINATES, X_drone, X_ground, final_ground_data, satellite = True)

### 2. Mapping 

In [None]:
ortho_data = pd.read_csv('Ecuador/features/ortho_data.csv', index_col = None)

In [None]:
list_sites = ['Carlos Vera Arteaga RGB', 'Carlos Vera Guevara RGB', 'Leonor Aspiazu RGB', 
             'Manuel Macias RGB', 'Nestor Macias RGB', 'Flora Pluas RGB']

In [None]:
# For visualization of the mapping, choose a site and a tile
site_name = input('Select a site: ') #Carlos Vera Guevara RGB
tile = input('Select a tile for visualisation: ') #Carlos Vera Guevara RGB_5_3800_3800_7800_7800.png

In [None]:
# Run every mapping method on all sites and save one matching file per method.
# The CNN variant is run with greedy=True, so it is named "... CNN Greedy";
# this is also the file name the later cells read
# ('Optimal Transport with CNN Greedy_final_matching.csv').
methods = ['Nearest Neighbours', 'Optimal Transport Non-Greedy', 
           'Optimal Transport Greedy', 'Optimal Transport with CNN Greedy',
          'Gromov-Wasserstein']

for method_name in methods:

    if method_name == 'Nearest Neighbours':
        final = get_matching_baseline_Ecuador(list_sites)
    elif method_name == 'Optimal Transport Non-Greedy':
        G = get_map_Ecuador(list_sites, method = 'OT')
        final = get_matching_Ecuador(list_sites, G, greedy = False, drone_to_ground = True)
    elif method_name == 'Optimal Transport Greedy':
        G = get_map_Ecuador(list_sites, method = 'OT')
        final = get_matching_Ecuador(list_sites, G, greedy = True)
    elif method_name == 'Optimal Transport with CNN Greedy':
        G = get_map_Ecuador(list_sites, method = 'OT + CNN', mu = 1)
        final = get_matching_Ecuador(list_sites, G, greedy = True)
    elif method_name == 'Gromov-Wasserstein':
        G = get_map_Ecuador(list_sites, method = 'GW')
        final = get_matching_Ecuador(list_sites, G, greedy = True)

    final.to_csv('Ecuador/results/{}_final_matching.csv'.format(method_name))

In [None]:
# Visualisation of Nearest Neighbours mapping (example)
method = 'Nearest Neighbours'
final = pd.read_csv('Ecuador/results/{}_final_matching.csv'.format(method))

params = {'Params': 'Position', 'Type': 'Baseline', 'Musacea Accuracy': acc_musacea(final), 'Merge': method}
annotations_files = pd.read_csv('Ecuador/annotations/{}_annotations_processed_cnn.csv'.format(site_name))
visualize_tile_prediction(site_name, tile, final, annotations_files, ortho_data, params)

# Density plot
d = density_plots('Method: {}'.format(method), final)
d.savefig('Ecuador/results/{}_density.png'.format(method), bbox_inches='tight', pad_inches=0)

#### Visualisation for WWF

In [None]:
import matplotlib.image as mpimg
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
final = pd.read_csv('Ecuador/results/Optimal Transport with CNN Greedy_final_matching.csv', index_col = 0)

In [None]:
unique = final["name"].unique()
palette = dict(zip(unique, sns.color_palette(n_colors=len(unique))))

In [None]:
final.columns

#### Plot ground observations (before OT mapping) VS final tree dataset (after OT mapping)

In [None]:
# Before OT mapping - Ground measurements

final_ground_data = pd.read_csv('Ecuador/features/final_ground_data.csv', index_col = 0)
# Independent copy of the site's rows: the diameter clean-up below must not
# write into (a slice of) final_ground_data.
df_ground_site = final_ground_data[final_ground_data.site == site_name].copy()
    
img_path = 'Ecuador/wwf_ecuador/RGB Orthomosaics/{}.tif'.format(site_name)
# The orthomosaic is the background of both figures; read it once.
im = mpimg.imread(img_path)

# Figure 1: observed species
fig, ax = plt.subplots(1, figsize = (15,15))

ax = sns.scatterplot(x="X", y="Y", data=df_ground_site, s = 50, hue="name", edgecolor='black',linewidth=1, palette = palette)
plt.axis('off')
ax.imshow(im)
plt.legend(loc="upper right", borderaxespad=0., fontsize=15, markerscale=1.5)
#plt.title('Species Prediction with Optimal Transport', fontsize=16, y = 1.05)
plt.savefig('Ecuador/results/fig/ground_measurements/species_observations_{}.png'.format(site_name))
plt.show()


# Treat a diameter of 0 and any diameter above 50 as missing. Series.mask
# builds a new column instead of writing through `.values`, which is not
# guaranteed to reach the frame and is read-only under pandas copy-on-write.
diameter = df_ground_site['diameter'].replace(0.0, np.nan)
df_ground_site['diameter'] = diameter.mask(diameter > 50)

# Figure 2: observed diameters
fig, ax = plt.subplots(1, figsize = (15,15))
ax.imshow(im)
plt.axis('off')
#ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c='black',s = 10)
img = ax.scatter(x=df_ground_site['X'],y=df_ground_site['Y'],c=df_ground_site['diameter'],cmap='copper', s = 60)
#plt.title('Diameter Prediction with Optimal Transport', fontsize=16, y = 1.05)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
plt.colorbar(img, cax=cax)
plt.savefig('Ecuador/results/fig/ground_measurements/diameter_observations_{}.png'.format(site_name))
plt.show()

In [None]:
site_name = 'Manuel Macias RGB'


# After OT mapping
# Independent copy of the site's rows: the diameter clean-up below must not
# write into (a slice of) `final`.
df_site = final[final.img_name == site_name].copy()
    
img_path = 'Ecuador/wwf_ecuador/RGB Orthomosaics/{}.tif'.format(site_name)
# The orthomosaic is the background of both figures; read it once.
im = mpimg.imread(img_path)

# Figure 1: mapped species
fig, ax = plt.subplots(1, figsize = (15,15))

ax = sns.scatterplot(x="X_d", y="Y_d", data=df_site, s = 50, hue="name", edgecolor='black',linewidth=1, palette = palette)
plt.axis('off')
ax.imshow(im)
plt.legend(loc="upper right", borderaxespad=0., fontsize=15, markerscale=1.5)
#plt.title('Species Prediction with Optimal Transport', fontsize=16, y = 1.05)
plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_species_mapping_{}.png'.format(site_name))
plt.show()


# Treat a diameter of 0 and any diameter above 50 as missing. Series.mask
# builds a new column instead of writing through `.values`, which is not
# guaranteed to reach the frame and is read-only under pandas copy-on-write.
diameter = df_site['diameter'].replace(0.0, np.nan)
df_site['diameter'] = diameter.mask(diameter > 50)

# Figure 2: mapped diameters
fig, ax = plt.subplots(1, figsize = (15,15))
ax.imshow(im)
plt.axis('off')
#ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c='black',s = 10)
img = ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c=df_site['diameter'],cmap='copper', s = 60)
#plt.title('Diameter Prediction with Optimal Transport', fontsize=16, y = 1.05)
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
plt.colorbar(img, cax=cax)
plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_diameter_mapping_{}.png'.format(site_name))
plt.show()

#### Plot final species and diameter OT mapping for all sites

In [None]:
list_sites = ['Carlos Vera Arteaga RGB', 'Carlos Vera Guevara RGB', 'Leonor Aspiazu RGB', 
             'Manuel Macias RGB', 'Nestor Macias RGB', 'Flora Pluas RGB']

In [None]:
# Save the species and diameter mapping figures of every site.
for site_name in list_sites:
    # Independent copy of the site's rows: the diameter clean-up below must
    # not write into (a slice of) `final`.
    df_site = final[final.img_name == site_name].copy()
    
    img_path = 'Ecuador/wwf_ecuador/RGB Orthomosaics/{}.tif'.format(site_name)
    # The orthomosaic is the background of both figures; read it once.
    im = mpimg.imread(img_path)

    # Figure 1: mapped species
    fig, ax = plt.subplots(1, figsize = (15,15))

    ax = sns.scatterplot(x="X_d", y="Y_d", data=df_site, s = 50, hue="name", edgecolor='black',linewidth=1, palette = palette)
    ax.imshow(im)
    #plt.title('Species Prediction with Optimal Transport', fontsize=16, y = 1.05)
    plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_species_mapping_{}.png'.format(site_name))
    #plt.show()


    # Treat a diameter of 0 and any diameter above 50 as missing. Series.mask
    # builds a new column instead of writing through `.values`, which is not
    # guaranteed to reach the frame and is read-only under pandas copy-on-write.
    diameter = df_site['diameter'].replace(0.0, np.nan)
    df_site['diameter'] = diameter.mask(diameter > 50)
    
    # Figure 2: mapped diameters
    fig, ax = plt.subplots(1, figsize = (15,15))
    ax.imshow(im)
    #ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c='black',s = 10)
    img = ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c=df_site['diameter'],cmap='copper', s = 60)
    #plt.title('Diameter Prediction with Optimal Transport', fontsize=16, y = 1.05)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(img, cax=cax)
    plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_diameter_mapping_{}.png'.format(site_name))
    plt.show()

In [None]:
heights = final['height'].unique()
for h in heights[1:]:
    df = final[final['height'] == h]
    print(df.name.to_numpy())

In [None]:
ground_data = pd.read_csv('Ecuador/features/initial_ground_data.csv')
diameters = ground_data['diameter'].unique()
for d in diameters[1:]:
    df = ground_data[ground_data['diameter'] == d]
    print(df.name.to_numpy())

### 3. Species Prediction

#### Effect of noise in ground data for Species classification accuracy - Ecuador real dataset

In [None]:
import seaborn as sns

In [None]:
final_res = pd.read_csv('Ecuador/results/classification_results_basic_small.csv', index_col = 0)
final_res = final_res.replace({"ot_greedy": "ot_GPS", "hand_ot_greedy": "hand_ot_GPS", "ot_cnn": "ot_GPS_and_species", "hand_ot_cnn": "hand_ot_GPS_and_species"})

In [None]:
final_res.groupby('method').mean()
final_res.groupby('method').var()

In [None]:
fig, ax = plt.subplots(figsize=[15, 7])
ax.set_ylabel('accuracy', size = 12)
ax.set_ylim(0,1)
ax.set_title('Classification Accuracy for the Ecuador naturally noisy Dataset', size = 16)

methods=["hand", "hand_nn", "nn", 'hand_gmn', 'gmn', 'hand_ot_GPS', 'ot_GPS', 'hand_ot_GPS_and_species', 'ot_GPS_and_species', 'hand_gw', 'gw']

sns.boxplot(x="method", y="accuracy", data=final_res, ax=ax, order = methods)
plt.setp(ax.get_xticklabels(), rotation=70, size=12)
ax.set(xlabel=None)

#### Binary species classification

In [None]:
ground_data = pd.read_csv('Ecuador/features/initial_ground_data.csv')

list_labels = ground_data.name.unique()
len(list_labels)
np.save('Ecuador/cnn/list_species.npy', list_labels)

In [None]:
final = pd.read_csv('Ecuador/results/Optimal Transport with CNN Greedy_final_matching.csv', index_col = 0)

In [None]:
final.columns

In [None]:
def get_patches(df, show=True):
    """Crop every tree box out of its tile and resize it to 224x224.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tree. ``img_name`` and ``img_path`` locate the tile under
        ``Ecuador/images``; ``xmin``, ``ymin``, ``xmax``, ``ymax`` are the
        bounds used to slice the tile image.
    show : bool, default True
        Display each non-empty crop with matplotlib. Pass ``False`` to skip
        plotting when processing many trees.

    Returns
    -------
    tuple
        ``(patches, new_df)`` where ``patches`` is a float32 array holding the
        resized crops and ``new_df`` is a copy of ``df`` without the rows
        whose crop was empty, with its index reset.
    """
    patches = []
    empty_rows = []

    for index, row in df.iterrows():
        path = 'Ecuador/images/{}/{}'.format(row.img_name, row.img_path)
        # Close the raster as soon as its pixels are in memory; one dataset
        # is opened per row, so leaving them open leaks a handle per tree.
        with rasterio.open(path) as src:
            raster = src.read()
        image = reshape_as_image(raster)
        tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
        if tree.shape[0] > 0 and tree.shape[1] > 0:
            patches.append(cv2.resize(np.array(tree, dtype = np.float32), (224, 224)))
            # Only valid crops are displayed: an empty crop is dropped below
            # and imshow can fail on a zero-size array.
            if show:
                plt.imshow(tree)
                plt.show()
        else:
            empty_rows.append(index)

    patches = np.array(patches, dtype = np.float32)
    new_df = df.drop(index = empty_rows).reset_index(drop = True)

    return(patches, new_df)


def get_patches_and_labels(df, return_labels=False, show=True):
    """Crop every tree box out of its tile, resize it to 224x224 and collect its label.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tree. ``img_name`` and ``img_path`` locate the tile under
        ``Ecuador/images``; ``xmin``, ``ymin``, ``xmax``, ``ymax`` are the
        bounds used to slice the tile image; the ``name`` column is the label.
    return_labels : bool, default False
        When ``True`` the labels are returned as well. The default keeps the
        historical two-element return value.
    show : bool, default True
        Display each non-empty crop with matplotlib.

    Returns
    -------
    tuple
        ``(patches, new_df)`` by default, or ``(patches, labels, new_df)``
        when ``return_labels`` is true. ``patches`` is a float32 array of
        resized crops, ``labels`` an array aligned with ``patches`` and
        ``new_df`` a copy of ``df`` without the rows whose crop was empty,
        with its index reset.
    """
    patches = []
    labels = []
    empty_rows = []

    for index, row in df.iterrows():
        path = 'Ecuador/images/{}/{}'.format(row.img_name, row.img_path)
        # Close the raster as soon as its pixels are in memory; one dataset
        # is opened per row, so leaving them open leaks a handle per tree.
        with rasterio.open(path) as src:
            raster = src.read()
        image = reshape_as_image(raster)
        tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
        if tree.shape[0] > 0 and tree.shape[1] > 0:
            patches.append(cv2.resize(np.array(tree, dtype = np.float32), (224, 224)))
            # `row.name` is the row's index label (Series.name), not the
            # "name" column, so the column must be read by key.
            labels.append(row['name'])
            # Only valid crops are displayed: an empty crop is dropped below
            # and imshow can fail on a zero-size array.
            if show:
                plt.imshow(tree)
                plt.show()
        else:
            empty_rows.append(index)

    patches = np.array(patches, dtype = np.float32)
    labels = np.array(labels)
    new_df = df.drop(index = empty_rows).reset_index(drop = True)

    if return_labels:
        return(patches, labels, new_df)
    return(patches, new_df)


In [None]:
site_name = 'Flora Pluas RGB'

annot = pd.read_csv('Ecuador/annotations/{}_annotations_processed_cnn.csv'.format(site_name))
annot.head() 

In [None]:
patches, annot = get_patches(annot)
#np.save('Ecuador/cnn/test/patches_{}.npy'.format(site_name), patches)

In [None]:
labels = np.load('Ecuador/cnn/test/labels_{}.npy'.format(site_name))
annot['label'] = labels

In [None]:
annot.to_csv('Ecuador/cnn/test/predictions_{}.npy'.format(site_name))
annot.columns

In [None]:
site_name = 'Flora Pluas RGB'
# The predictions file was written with DataFrame.to_csv (despite its ".npy"
# suffix), so it is CSV text: np.load cannot parse it and the scatterplot
# below needs a DataFrame with named columns. The first column is the index
# that to_csv saved.
annot = pd.read_csv('Ecuador/cnn/test/predictions_{}.npy'.format(site_name), index_col = 0)
img_path = 'Ecuador/wwf_ecuador/RGB Orthomosaics/{}.tif'.format(site_name)

fig, ax = plt.subplots(1, figsize = (15,15))
im = mpimg.imread(img_path)

# Predicted label of every detected tree, drawn over the orthomosaic
ax = sns.scatterplot(x="X", y="Y", data=annot, s = 50, hue="label", edgecolor='black',linewidth=1)
plt.axis('off')
ax.imshow(im)
plt.legend(loc="lower right", borderaxespad=0., fontsize=15, markerscale=1.5)
#plt.title('Species Prediction (Binary Classification)', fontsize=16, y = 1.05)
#plt.savefig('Ecuador/results/fig/Binary_species_prediction_{}.png'.format(site_name))
#plt.show()

In [None]:
# Save the species and diameter prediction figures of every site.
for site_name in list_sites:
    # Select this site's rows of the matching. Without this, every iteration
    # would draw whatever `df_site` an earlier cell left behind over the
    # current site's image.
    # NOTE(review): assumes the figures are meant to show the `final`
    # matching (columns X_d, Y_d, name, diameter) -- confirm.
    df_site = final[final.img_name == site_name].copy()
    
    img_path = 'Ecuador/wwf_ecuador/RGB Orthomosaics/{}.tif'.format(site_name)
    # The orthomosaic is the background of both figures; read it once.
    im = mpimg.imread(img_path)

    # Figure 1: species
    fig, ax = plt.subplots(1, figsize = (15,15))

    ax = sns.scatterplot(x="X_d", y="Y_d", data=df_site, s = 50, hue="name", edgecolor='black',linewidth=1, palette = palette)
    ax.imshow(im)
    plt.title('Species Prediction with Optimal Transport', fontsize=16, y = 1.05)
    plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_species_prediction_{}.png'.format(site_name))
    #plt.show()


    # Treat a diameter of 0 and any diameter above 50 as missing. Series.mask
    # builds a new column instead of writing through `.values`, which is not
    # guaranteed to reach the frame and is read-only under pandas copy-on-write.
    diameter = df_site['diameter'].replace(0.0, np.nan)
    df_site['diameter'] = diameter.mask(diameter > 50)
    
    # Figure 2: diameters
    fig, ax = plt.subplots(1, figsize = (15,15))
    ax.imshow(im)
    #ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c='black',s = 10)
    img = ax.scatter(x=df_site['X_d'],y=df_site['Y_d'],c=df_site['diameter'],cmap='copper', s = 60)
    plt.title('Diameter Prediction with Optimal Transport', fontsize=16, y = 1.05)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(img, cax=cax)
    plt.savefig('Ecuador/results/fig/OT_CNN-filtering_matching/OT_CNN-filtering_diameter_prediction_{}.png'.format(site_name))
    plt.show()

## NEON Dataset

### 1. Data Preparation

In [None]:
df_drone = pd.read_csv('NEON/annotations/final_annotations.csv', index_col = 0)
df_ground = pd.read_csv('NEON/features/final_ground_data.csv', index_col = 0)

In [None]:
list_labels = np.unique(df_ground.scientificName.to_numpy())
len(list_labels)
np.save('NEON/data/list_species.npy', list_labels)

In [None]:
df = pd.read_csv('NEON/data/field_data.csv', index_col = 0)
list_labels = np.unique(df.scientificName.to_numpy())
len(list_labels)
list_labels
#np.save('NEON/data/list_species.npy', list_labels)

In [None]:
list_sites1 = np.unique(df_drone['site'].to_numpy())
print(list_sites1)

In [None]:
list_sites2 = np.unique(df_ground['site'].to_numpy())
print(list_sites2)

In [None]:
# Visualise drone and ground data for the whole site TEAK

df1 = df_drone[df_drone['site']=='TEAK']
df2 = df_ground[df_ground['site']=='TEAK']

# Plot all points
X_drone = df1[['X', 'Y']].to_numpy()
print(len(X_drone))
X_ground = df2[['X', 'Y']].to_numpy()
print(len(X_ground))

plt.figure(figsize=(15,15))

# One scatter call per point set (drone in red, ground in blue) instead of
# one call per point: same picture, far fewer matplotlib artists.
plt.scatter(x=X_drone[:, 0], y=X_drone[:, 1], c='r', s=40)
plt.scatter(x=X_ground[:, 0], y=X_ground[:, 1], c='b', s=40)
plt.show()

In [None]:
# Visualise drone and ground data for the small tile ABBY_063

df1 = df_drone[df_drone['img_path']=='ABBY_063.tif']
df2 = df_ground[df_ground['img_path']=='ABBY_063.tif']

# Read the tile, closing the raster as soon as its pixels are in memory
with rasterio.open('NEON/images/ABBY_063.tif') as src:
    raster = src.read()
image = reshape_as_image(raster)

# Plot all points
X_drone = df1[['x', 'y']].to_numpy()
print(len(X_drone))
X_ground = df2[['x', 'y']].to_numpy()
print(len(X_ground))

plt.figure(figsize=(15,15))
plt.imshow(image.astype(int))
# One scatter call per point set (drone in red, ground in blue) instead of
# one call per point: same picture, far fewer matplotlib artists.
plt.scatter(x=X_drone[:, 0], y=X_drone[:, 1], c='r', s=40)
plt.scatter(x=X_ground[:, 0], y=X_ground[:, 1], c='b', s=40)
plt.show()

#### Build True Matching from Ground Data

In [None]:
df_ground = pd.read_csv('NEON/features/final_ground_data.csv', index_col = 0)

In [None]:
df_ground.columns

In [None]:
def convert_to_box(x, d):
    """Add a square box of half-width ``d`` centred on (``x['x']``, ``x['y']``).

    The keys ``xmin``, ``ymin``, ``xmax`` and ``ymax`` are written into ``x``
    itself, and that same object is returned.
    """
    for bound, offset in (('min', -d), ('max', d)):
        for axis in ('x', 'y'):
            x[axis + bound] = x[axis] + offset
    return(x)

df_ground = df_ground.apply(lambda l: convert_to_box(l, 40), axis = 1)
df_ground

In [None]:
# Display the crop around every ground tree together with its species name.
for _, row in df_ground.iterrows():
    # Close the raster as soon as its pixels are in memory; one dataset is
    # opened per tree, so leaving them open leaks a handle per row.
    with rasterio.open(os.path.join('NEON/images', row.img_path)) as src:
        raster = src.read()
    image = reshape_as_image(raster)
    tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
    try:
        print(row.scientificName)
        plt.imshow(np.array(tree))
        plt.show()
    except ValueError:  # raised when the crop is empty: skip that tree
        pass


In [None]:
df_ground.to_csv('NEON/data/true_matching.csv')

#### Build Species list

In [None]:
df_true = pd.read_csv('NEON/data/true_matching.csv', index_col = 0)

In [None]:
list_species = np.unique(df_true['scientificName'].to_numpy())
list_species_small = np.unique([x.split(' ')[0] for x in list_species])
np.save('NEON/data/list_species_reduced', list_species_small)

### 2. Mapping

In [None]:
df_true = pd.read_csv('NEON/data/true_matching.csv', index_col = 0)
df_true.columns

In [None]:
methods = ['NN', 'OT non greedy', 'OT greedy', 'GW']
sigmas = np.array([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4])

def get_final_matching_NEON(df_true, methods, sigmas):
    """Run each matching method on noise-perturbed positions and save the result.

    The reference table is split into a "drone" view and a "ground" view. For
    every (method, sigma) pair, a fresh Gaussian noise sample with standard
    deviation ``sigma`` is drawn, the ground ``X``/``Y`` columns are replaced
    by the drone ``X``/``Y`` plus that noise, the matching is computed and
    written to ``NEON/results/final_<method>_sigma_<sigma>.csv``.

    Parameters
    ----------
    df_true : pandas.DataFrame
        Reference matching table; must contain every column selected below.
    methods : sequence of str
        Recognised names are 'NN', 'OT non greedy', 'OT greedy' and 'GW'.
        Any other name is skipped: noise is still drawn and the debug lines
        are still printed, but nothing is computed or written.
    sigmas : sequence of float
        Noise standard deviations to sweep.

    Returns
    -------
    None
    """
    drone_columns = ['x', 'y', 'site', 'xmin', 'ymin', 'xmax', 'ymax', 'X', 'Y',
                     'itcLongitude', 'itcLatitude', 'img_path', 'X_left', 'Y_top']
    ground_columns = ['X', 'Y', 'site', 'img_path', 'scientificName', 'stemDiameter',
                      'measurementHeight', 'height']
    # Method name -> (solver passed to get_map_NEON, greedy flag passed to
    # get_matching_NEON). 'NN' is handled separately because it uses the
    # baseline matcher and needs no map.
    map_based_variants = {
        'OT non greedy': ('OT', False),
        'OT greedy': ('OT', True),
        'GW': ('GW', True),
    }

    list_sites = np.unique(df_true['site'].to_numpy())
    df_drone = df_true[drone_columns]
    df_ground = df_true[ground_columns]
    n_rows = len(df_true)

    for method in methods:
        for sigma in sigmas:
            # A new noise sample is drawn for every (method, sigma) pair.
            noise = np.random.normal(0, sigma, (n_rows, 2))

            df_g = df_ground.copy()
            df_d = df_drone.copy()
            df_g[['X', 'Y']] = df_d[['X', 'Y']] + noise
            # Debug output: first position before and after adding the noise.
            print(df_ground[['X', 'Y']].to_numpy()[0])
            print(df_g[['X', 'Y']].to_numpy()[0])

            if method == 'NN':
                final = get_matching_baseline_NEON(list_sites, df_d, df_g)
            elif method in map_based_variants:
                solver, use_greedy = map_based_variants[method]
                G = get_map_NEON(list_sites, df_d, df_g, method = solver)
                final = get_matching_NEON(list_sites, df_d, df_g, G, greedy = use_greedy)
            else:
                # Unrecognised method name: nothing to compute or save.
                continue

            final.to_csv('NEON/results/final_{}_sigma_{}.csv'.format(method, sigma))

    return

In [None]:
# Run every method / sigma combination; one CSV per combination is written
# under NEON/results.
get_final_matching_NEON(df_true, methods, sigmas)

In [None]:
# Spot-check one of the generated result files by looking at its shape.
df = pd.read_csv('NEON/results/final_OT non greedy_sigma_1.5.csv', index_col = 0)
df.shape

### 3. Species Classifier

#### Effect of noise in ground data on species classification accuracy - NEON synthetic dataset

In [None]:
# Load the classification results and relabel "OT greedy" as "OT", which is
# the key used for that colour in `mypalette`.
final_res = pd.read_csv('NEON/results/classification_results_basic_small.csv', index_col = 0)
final_res = final_res.replace({"OT greedy": "OT"})

In [None]:
# Seaborn defaults with the "ticks" axes style.
sns.set_theme()
sns.set_style("ticks")

# Accuracy as a function of the noise level, one line per matching method,
# coloured according to `mypalette`.
plt.figure(figsize=(10,7))
ax = sns.lineplot(data=final_res, x="sigma", y="accuracy", hue="method", marker="o", palette = mypalette)

#ax.set_title('Classification Accuracy when Increasing Noise in GPS on a Synthetic Dataset\n (3839 trees of NEON Dataset)', size = 14)
#ax.set_ylim(0,1)
ax.set_xlabel("Sigma (m)",fontsize=15)
ax.set_ylabel("Accuracy",fontsize=15)
# Remove the top and right spines.
sns.despine()


#### Test species classifier on NEON tiles (ABBY_063, UNDE_006 and BART_050)

In [None]:
import matplotlib.image as mpimg
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Load the annotation table (df_drone) and the ground data table (df_ground);
# in both files the first CSV column is used as the index.
df_drone = pd.read_csv('NEON/annotations/final_annotations.csv', index_col = 0)
df_ground = pd.read_csv('NEON/features/final_ground_data.csv', index_col = 0)

In [None]:
# Display the available column names.
df_drone.columns

In [None]:
def get_patches(df):
    """Crop every box of *df* out of its NEON tile and resize it to 224x224.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``img_path`` (file name under ``NEON/images``) and the box
        columns ``xmin``, ``ymin``, ``xmax``, ``ymax``, which are truncated to
        int and used as indices into the tile array.

    Returns
    -------
    (patches, new_df)
        ``patches`` is a float32 array holding one resized crop per kept row.
        ``new_df`` is a copy of *df* without the rows whose crop has a zero
        height or width, with its index reset so that row ``i`` corresponds to
        ``patches[i]``.
    """
    patches = []
    empty_rows = []
    # Consecutive rows that share a tile reuse the pixels already in memory
    # instead of re-reading the file; only one tile is held at a time.
    loaded_path = None
    image = None

    for index, row in df.iterrows():
        if row.img_path != loaded_path:
            # The context manager releases the file handle after the read.
            with rasterio.open(os.path.join('NEON/images', row.img_path)) as src:
                image = reshape_as_image(src.read())
            loaded_path = row.img_path

        tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
        if tree.shape[0] > 0 and tree.shape[1] > 0:
            img = cv2.resize(np.array(tree, dtype = np.float32), (224, 224))
            patches.append(img)
        else:
            empty_rows.append(index)

    patches = np.array(patches, dtype = np.float32)
    # Drop all empty rows in one call: dropping label by label raises KeyError
    # when the same index label has to be dropped twice.
    new_df = df.drop(index = empty_rows).reset_index(drop = True)

    return patches, new_df


In [None]:
# Choose the tile to inspect: leave exactly one assignment uncommented.
#img_name = 'ABBY_063.tif'
#img_name = 'UNDE_006.tif'
img_name = 'BART_050.tif'

# Rows of df_drone that belong to the chosen tile.
df_img = df_drone[df_drone.img_path == img_name]
# Uncomment to extract the patches of this tile and save them to disk.
#patches, df_img = get_patches(df_img)
#np.save('NEON/cnn/test/patches_{}.npy'.format(img_name.replace('.tif','')), patches)

In [None]:
# Reduced species list saved to NEON/data/list_species_reduced.npy.
list_species = np.load('NEON/data/list_species_reduced.npy', allow_pickle=True)
print(len(list_species))
# Position in the list -> name, and the inverse lookup name -> position.
dico_species = dict(enumerate(list_species))
rev_subs = {name: position for position, name in dico_species.items()}
print(rev_subs)

In [None]:
# Labels stored on disk for the selected tile.
labels = np.load('NEON/cnn/test/labels_{}.npy'.format(img_name.replace('.tif','')))
# df_img is a filtered selection of df_drone; assign() builds a new frame
# instead of writing a column into that selection, which avoids pandas'
# SettingWithCopyWarning.
# NOTE(review): labels needs exactly one entry per row of df_img - confirm the
# file was produced from the same rows (get_patches may drop some).
df_img = df_img.assign(label=labels)

# The context manager closes the dataset once the pixels are read.
with rasterio.open('NEON/images/{}'.format(img_name)) as src:
    raster = src.read()
image = reshape_as_image(raster)

# Plot all points
fig, ax = plt.subplots(1, figsize = (15,15))
im = mpimg.imread('NEON/images/{}'.format(img_name))

# NOTE(review): `palette` has to exist before this cell runs; confirm it is
# defined by a cell executed earlier.
ax = sns.scatterplot(x="x", y="y", data=df_img, s = 150, hue="label", edgecolor='black',linewidth=1, palette = palette)
plt.axis('off')
ax.imshow(im)

plt.legend(loc="lower right", borderaxespad=0., fontsize=20, markerscale=2)

In [None]:
# Reference matching table; the first CSV column is the index.
final_true = pd.read_csv('NEON/data/true_matching.csv', index_col = 0)

# 'name' keeps only the first word of each scientific name.
final_true['name'] = final_true["scientificName"].apply(lambda x: x.split(' ')[0])
unique = final_true["name"].unique()
# One colour per distinct name, used as the `palette` argument of the scatter plots.
palette = dict(zip(unique, sns.color_palette(n_colors=len(unique)))) 

final_true.columns

In [None]:
# Choose the tile to inspect: leave exactly one assignment uncommented.
#img_name = 'ABBY_063.tif'
#img_name = 'UNDE_006.tif'
img_name = 'BART_050.tif'

# Rows of the reference table that belong to the chosen tile.
df_img = final_true[final_true.img_path ==img_name]

# The context manager closes the dataset once the pixels are read.
with rasterio.open('NEON/images/{}'.format(img_name)) as src:
    raster = src.read()
image = reshape_as_image(raster)

# Plot all points
fig, ax = plt.subplots(1, figsize = (15,15))
im = mpimg.imread('NEON/images/{}'.format(img_name))

# Points coloured by the 'name' column, drawn over the tile image.
ax = sns.scatterplot(x="x", y="y", data=df_img, s = 150, hue="name", edgecolor='black',linewidth=1, palette = palette)
plt.axis('off')
ax.imshow(im)
plt.legend(loc="lower right", borderaxespad=0., fontsize=20, markerscale=2)

In [None]:
from numpy import random

# Initial Ground measurements

# For each site, pick one of its tiles at random and save a figure of the
# observations it contains, coloured by the 'name' column.
# NOTE(review): `list_sites2` is expected to be defined by an earlier cell.
for site_name in list_sites2:
    site = final_true[final_true.site == site_name]
    img_name = random.choice(site.img_path.unique())
    print(img_name)

    df_img = final_true[final_true.img_path ==img_name]

    # The context manager closes each dataset once its pixels are read, so the
    # loop does not leave one open file handle per site.
    with rasterio.open('NEON/images/{}'.format(img_name)) as src:
        raster = src.read()
    image = reshape_as_image(raster)

    # Plot all points
    fig, ax = plt.subplots(1, figsize = (15,15))
    im = mpimg.imread('NEON/images/{}'.format(img_name))

    ax = sns.scatterplot(x="x", y="y", data=df_img, s = 100, hue="name", edgecolor='black',linewidth=1, palette = palette)
    ax.imshow(im)
    plt.title('Species Observations', fontsize=16, y = 1.05)
    # img_name still carries its '.tif' extension, so files end in '.tif.png'.
    plt.savefig('NEON/results/fig/species_observations_{}.png'.format(img_name))
