# Data preparation for species classification - OneForest

Context: A 4-block CNN (called Model Basic) is used for species classification. The datasets obtained after mapping ground and drone data with Nearest Neighbours, Graph Matching Networks, or an Optimal Transport method are used to train the CNN.

For training, the inputs are image patches of the trees and the labels are the species (the scientific names of the trees).
The notebook first defines functions that produce patches (as NumPy arrays), then creates the patches and labels for the Ecuador and NEON datasets.

In [None]:
import tensorflow as tf
import rasterio
from rasterio.plot import reshape_as_image

from keras.layers import Dense, Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dropout, InputLayer, MaxPool2D
from tensorflow.keras import optimizers, models
from keras.applications.resnet50 import ResNet50
from keras.models import Model
from keras.models import Sequential
from keras import optimizers
import keras

from deepforest import utilities


In [None]:
import skimage.color
import skimage.io

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2

import torch
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
torch.manual_seed  # NOTE(review): bare attribute reference; the function is never called, so no seed is set here

In [None]:
import sys
# Add the parent of the current working directory, and its utils/ subfolder,
# to the import path so the `utils` package imported below can be resolved.
package = os.path.dirname(os.getcwd())
sys.path.append(package)
sys.path.append(package + '/utils')
sys.path  # bare expression; it is not the last statement of the cell
# Star import: `get_data`, used by convert_xml_to_csv, is presumably provided here (TODO confirm).
from utils.deepforest_detection import *

## Ecuador dataset

In [None]:
# Load the Gromov-Wasserstein (greedy) matching results for Ecuador;
# the first CSV column is used as the DataFrame index.
final = pd.read_csv('Ecuador/results/Gromov-Wasserstein Greedy_final_matching.csv', index_col = 0)

In [None]:
# Split the matches on the `is_musacea_g` flag: rows with flag 0 go to
# `final_pos`, rows with flag 1 go to `final_neg`.
# NOTE(review): the naming reads inverted (flag == 1 ends up in the "neg" set);
# confirm this matches the intended label convention.
final_pos = final[final['is_musacea_g'] == 0]
final_neg = final[final['is_musacea_g'] == 1]

In [None]:
def convert_xml_to_csv(directory, name, train_on_resized = True):
    """Convert the XML annotation file `<name>.xml` into a header-less CSV.

    The path `<cwd>/<directory>/<name>.xml` is passed through `get_data`
    (defined outside this cell), parsed with `utilities.xml_to_annotations`,
    and the result is written to `<cwd>/<directory>/hand_<name>.csv` without
    index and without a header row.

    Args:
        directory: Folder, relative to the current working directory, that
            holds the XML file and receives the CSV.
        name: File stem shared by the XML input and the CSV output.
        train_on_resized: Accepted but not used by this function.

    Returns:
        The annotations object returned by `utilities.xml_to_annotations`.
    """
    base_dir = os.path.join(os.getcwd(), directory)
    xml_source = get_data(os.path.join(base_dir, "{}.xml".format(name)))
    csv_target = os.path.join(base_dir, "hand_{}.csv".format(name))

    annotations = utilities.xml_to_annotations(xml_source)
    annotations.to_csv(csv_target, index=False, header=None)
    return annotations

def get_patches(path_to_img, boxes):
    """Crop one patch per bounding box from the image stored at `path_to_img`.

    Args:
        path_to_img: Path of the image file, read with `skimage.io.imread`.
        boxes: Table whose rows expose `xmin`, `ymin`, `xmax` and `ymax`;
            the values are truncated to integers before slicing.

    Returns:
        A list with one cropped array per row, in row order. Crops whose
        shape is exactly (201, 312, 3) are left out (the reason for this
        specific shape is not documented in the notebook).
    """
    image = skimage.io.imread(path_to_img)

    # Rows index the first image axis (y), columns the second (x).
    crops = (
        image[int(box.ymin):int(box.ymax), int(box.xmin):int(box.xmax)]
        for _, box in boxes.iterrows()
    )
    return [crop for crop in crops if crop.shape != (201, 312, 3)]

def resize_patches(patches):
    """Resize every patch to 200x200 and stack the results into one array.

    The number of patches is printed first. Each patch is converted to a
    float32 array before being passed to `cv2.resize`.

    Args:
        patches: Indexable collection of image patches.

    Returns:
        A NumPy array built from the list of resized patches.
    """
    print(len(patches))
    resized = [
        cv2.resize(np.array(patch, dtype = np.float32), (200,200))
        for patch in patches
    ]
    return np.array(resized)

def patch_match(final):
    """Collect the tree patches of every image tile referenced in `final`.

    For each distinct `img_path`, the rows of that tile are selected, the
    image is located under `<cwd>/Ecuador/images/<site>/<img_path>` and the
    boxes are cropped with `get_patches`.

    Args:
        final: DataFrame with the columns `img_path`, `xmin`, `ymin`,
            `xmax` and `ymax`.

    Returns:
        A NumPy array built from all retained patches.
    """
    patches = []
    for path in np.unique(final['img_path']):
        sub = final[final.img_path == path]
        # The site folder is the part of the file name before the first '_'.
        site = path.split('_')[0]
        path_to_img = os.path.join(os.getcwd(), 'Ecuador/images/{}/{}'.format(site, path))
        boxes = sub[['xmin', 'ymin', 'xmax', 'ymax']]
        patch = get_patches(path_to_img, boxes)
        b = np.array(patch)
        # Skip tiles that yield a single patch or a 3-D stack.
        # `|` binds tighter than `==`, so the previous expression
        # `b.shape[0] == 1 | len(b.shape)==3` was evaluated as the chained
        # comparison `b.shape[0] == (1 | len(b.shape)) == 3` and tested
        # neither condition on its own; a logical `or` expresses the intent.
        if b.shape[0] == 1 or b.ndim == 3:
            continue
        patches.extend(patch)
    return(np.array(patches))

def get_patches_and_labels(directory, name, path_to_img, boxes):
    """Interactively label the tree patches of one image.

    Every box is cropped from the image, resized to 200x200, displayed with
    matplotlib, and the user is prompted for its label. The patches and the
    answers are saved as .npy files, and the boxes (with a new `labels`
    column) are written to `<cwd>/<directory>/hand_<name>.csv`.

    Args:
        directory: Folder, relative to the current working directory, that
            receives the CSV.
        name: File stem used for the CSV (`hand_<name>.csv`).
        path_to_img: Path of the image to crop, read with `skimage.io.imread`.
        boxes: DataFrame with `xmin`, `ymin`, `xmax`, `ymax` columns. It is
            modified in place: a `labels` column is added.

    Returns:
        Tuple `(patches, labels)` of NumPy arrays. `labels` holds the raw
        strings typed at the prompt.
    """
    patches = []
    labels = []
    image = skimage.io.imread(path_to_img)

    for _, row in boxes.iterrows():
        tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)]
        tree = cv2.resize(tree, (200, 200))
        plt.imshow(tree)
        plt.show()
        labels.append(input('Is it a Musacea? '))
        patches.append(tree)

    patches = np.array(patches)
    labels = np.array(labels)
    # NOTE(review): these two output paths are fixed to cnn/test2 and do not
    # follow the `directory` argument.
    np.save('cnn/test2/labels.npy', labels)
    np.save('cnn/test2/patches.npy', patches)

    # Use the `boxes` argument rather than the notebook-level `boxes_test`
    # variable, so the function also works when called with another table
    # or before that global exists.
    boxes['labels'] = labels
    boxes.to_csv(os.path.join(os.getcwd(), directory,"hand_{}.csv".format(name)),index=False, header=None)
    return(patches, labels)

In [None]:
# Extract the tree patches for both subsets of the Ecuador matching table.
patches_pos = patch_match(final_pos)
patches_neg = patch_match(final_neg)


In [None]:
# Number of patches in each subset; these counts size the label vectors built further down.
n_pos = len(patches_pos)
n_neg = len(patches_neg)
n_pos, n_neg

In [None]:
# Bring every patch to the common 200x200 size produced by resize_patches.
patches_pos = resize_patches(np.array(patches_pos))
patches_neg = resize_patches(np.array(patches_neg))

# Persist the resized training patches of both subsets.
np.save('Ecuador/cnn/train_gw/patches_pos.npy', patches_pos)
np.save('Ecuador/cnn/train_gw/patches_neg.npy', patches_neg)

In [None]:
# Label vectors: 0 for every patch of the "pos" subset, 1 for every patch of the "neg" subset.
y_pos = np.array([0]*n_pos)
y_neg = np.array([1]*n_neg)

# Stack both subsets into one training set.
X = np.concatenate((patches_pos, patches_neg), axis=0)
y = np.concatenate((y_pos, y_neg), axis=0)

# Shuffle with a single index permutation applied to both arrays so that
# patches and labels stay aligned.
# NOTE(review): no NumPy seed is set in the cells shown, so the order differs between runs.
randomize = np.arange(len(X))
np.random.shuffle(randomize)
X = X[randomize]
y = y[randomize]

print(X.shape)
print(y.shape)


In [None]:
# Convert the XML annotations of the test image to CSV and keep the returned boxes.
# `train_on_resized` is passed here but is not used inside convert_xml_to_csv.
boxes_test = convert_xml_to_csv('cnn/test2', 'test_example_2', train_on_resized = False)

In [None]:
test_dir = 'cnn/test2'  # not referenced again in this cell
path_to_img_test = os.path.join(os.getcwd(), 'cnn/test2/test_example_2.png')

# Label patches: 0 if musacea; 1 otherwise
# (each patch is shown and the answer is typed at the prompt; the answers come back as strings)
X_test, labels_test = get_patches_and_labels('cnn/test2', 'test_example_2', path_to_img_test, boxes_test)  
# Divide the pixel values by 255 (presumably 8-bit images, giving values in [0, 1]; TODO confirm).
X_test = X_test/255.

In [None]:
# Reload a previously saved test set and cast its labels to integers.
# NOTE(review): these files are read from 'cnn/test', whereas
# get_patches_and_labels writes to 'cnn/test2'; confirm which set is intended.
X_test = np.load('cnn/test/patches.npy')
y_true = np.load('cnn/test/labels.npy')
y_true = y_true.astype(int)

## NEON

In [None]:
def get_patches_and_labels_neon(final):
    """Crop one patch per row of a NEON matching table and collect its species.

    For every row the raster `NEON/images/<img_path>` is read with rasterio,
    converted to image layout with `reshape_as_image`, and cropped to the
    row's bounding box.

    Args:
        final: DataFrame with `xmin`, `ymin`, `xmax`, `ymax`,
            `scientificName` and either `img_path` or `img_path_d`
            (the latter is renamed to `img_path` on a copy).

    Returns:
        Tuple `(patches, labels)`: a list of integer arrays and a list with
        the `scientificName` of each row, both in row order.
    """
    patches = []
    labels = []

    if 'img_path_d' in final.columns:
        final = final.rename(columns = {'img_path_d': 'img_path'})

    for _, row in final.iterrows():
        # Open the dataset in a context manager so the file handle is closed
        # after reading; it was previously left open for every row.
        with rasterio.open(os.path.join('NEON/images', row.img_path)) as src:
            raster = src.read()
        image = reshape_as_image(raster)
        tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
        patches.append(tree)
        labels.append(row.scientificName)
    return(patches, labels)

def get_height_labels_neon(final):
    """Return the `height` value of every row of `final`, in row order.

    Args:
        final: DataFrame with a `height` column. If it has an `img_path_d`
            column, a renamed copy is iterated instead; the renamed column
            is not read here and the caller's frame is left untouched.

    Returns:
        A list with one height per row.
    """
    has_drone_path = 'img_path_d' in final.columns
    table = final.rename(columns = {'img_path_d': 'img_path'}) if has_drone_path else final
    return [record.height for _, record in table.iterrows()]

def get_diameter_labels_neon(final):
    """Return the `stemDiameter` value of every row of `final`, in row order.

    Args:
        final: DataFrame with a `stemDiameter` column. If it has an
            `img_path_d` column, a renamed copy is iterated instead; the
            renamed column is not read here and the caller's frame is left
            untouched.

    Returns:
        A list with one stem diameter per row.
    """
    has_drone_path = 'img_path_d' in final.columns
    table = final.rename(columns = {'img_path_d': 'img_path'}) if has_drone_path else final
    return [record.stemDiameter for _, record in table.iterrows()]


def get_patches_and_labels_idtree(final):
    """Build one patch per row of an IDTrees table and collect its species.

    For every row the raster `IDTrees/<rgb_path>` is read with rasterio and
    converted to image layout with `reshape_as_image`. Rows whose `site_id`
    is 'ESALQ' keep the whole image restricted to its first three channels;
    all other rows are cropped to their bounding box.

    Args:
        final: DataFrame with `rgb_path`, `site_id`, `scientific_name` and
            the box columns `xmin`, `ymin`, `xmax`, `ymax`.

    Returns:
        Tuple `(patches, labels)`: a list of integer arrays and a list with
        the `scientific_name` of each row, both in row order.
    """
    patches = []
    labels = []

    for _, row in final.iterrows():
        # Open the dataset in a context manager so the file handle is closed
        # after reading; it was previously left open for every row.
        with rasterio.open(os.path.join('IDTrees', row.rgb_path)) as src:
            raster = src.read()
        image = reshape_as_image(raster)
        if row.site_id == 'ESALQ':
            tree = image[:,:,:3].astype(int)
        else:
            tree = image[int(row.ymin):int(row.ymax), int(row.xmin):int(row.xmax)].astype(int)
        patches.append(tree)
        labels.append(row.scientific_name)
    return(patches, labels)


### Training patches and labels on NEON real (ground and drone data)

In [None]:
# Nearest Neighbours matching: extract patches and species labels, then save them.
# NOTE(review): `patches` is a list of crops that np.save converts to an array first;
# crops of different sizes may not convert cleanly. Confirm with the installed NumPy.
final = pd.read_csv('NEON/results/Nearest Neighbours_final_matching.csv', index_col = 0)
patches, labels = get_patches_and_labels_neon(final)
np.save('NEON/cnn/train/patches_nn.npy', patches)
np.save('NEON/cnn/train/labels_nn.npy', labels)

In [None]:
# Optimal Transport (non-greedy) matching: extract patches and species labels, then save them.
final = pd.read_csv('NEON/results/Optimal Transport Non-Greedy_final_matching.csv', index_col = 0)
patches, labels = get_patches_and_labels_neon(final)
np.save('NEON/cnn/train/patches_ot_non_greedy.npy', patches)
np.save('NEON/cnn/train/labels_ot_non_greedy.npy', labels)

In [None]:
# Optimal Transport (greedy) matching: extract patches and species labels, then save them.
final = pd.read_csv('NEON/results/Optimal Transport Greedy_final_matching.csv', index_col = 0)
patches, labels = get_patches_and_labels_neon(final)
np.save('NEON/cnn/train/patches_ot_greedy.npy', patches)
np.save('NEON/cnn/train/labels_ot_greedy.npy', labels)

In [None]:
# Graph Matching Network matching: extract patches and species labels, then save them.
final = pd.read_csv('NEON/results/Graph Matching Network_final_matching.csv', index_col = 0)
patches, labels = get_patches_and_labels_neon(final)
np.save('NEON/cnn/train/patches_gmn.npy', patches)
np.save('NEON/cnn/train/labels_gmn.npy', labels)

In [None]:
# Gromov-Wasserstein (greedy) matching: extract patches and species labels, then save them.
final = pd.read_csv('NEON/results/Gromov-Wasserstein Greedy_final_matching.csv', index_col = 0)
patches, labels = get_patches_and_labels_neon(final)
np.save('NEON/cnn/train/patches_gw.npy', patches)
np.save('NEON/cnn/train/labels_gw.npy', labels)

### Training patches and labels on NEON synthetic (noise added)

In [None]:
# Extract the NEON patches and species labels from every synthetic matching file
# (files in NEON/results whose name starts with 'final').
dir = 'NEON/results'
for file in os.listdir(dir):
    if not file.startswith('final'):
        continue
    # The file name is split on '_': field 3 gives sigma, field 1 the method.
    name_parts = file.split('_')
    sigma, method = name_parts[3], name_parts[1]
    path_file = os.path.join(dir, file)
    final = pd.read_csv(path_file, index_col = 0)
    patches, labels = get_patches_and_labels_neon(final)
    np.save('NEON/cnn/train/patches_{}_sigma_{}.npy'.format(method, sigma), patches)
    np.save('NEON/cnn/train/labels_{}_sigma_{}.npy'.format(method, sigma), labels)

In [None]:
# Extract the height labels from every synthetic matching file
# (files in NEON/results whose name starts with 'final').
dir = 'NEON/results'
for file in os.listdir(dir):
    if not file.startswith('final'):
        continue
    # The file name is split on '_': field 3 gives sigma, field 1 the method.
    name_parts = file.split('_')
    sigma, method = name_parts[3], name_parts[1]
    path_file = os.path.join(dir, file)
    final = pd.read_csv(path_file, index_col = 0)

    heights = get_height_labels_neon(final)
    np.save('NEON/cnn/train/heights_{}_sigma_{}.npy'.format(method, sigma), heights)

In [None]:
# Extract the stem-diameter labels from every synthetic matching file
# (files in NEON/results whose name starts with 'final').
dir = 'NEON/results'
for file in os.listdir(dir):
    if not file.startswith('final'):
        continue
    # The file name is split on '_': field 3 gives sigma, field 1 the method.
    name_parts = file.split('_')
    sigma, method = name_parts[3], name_parts[1]
    path_file = os.path.join(dir, file)
    final = pd.read_csv(path_file, index_col = 0)

    diameters = get_diameter_labels_neon(final)
    np.save('NEON/cnn/train/diameters_{}_sigma_{}.npy'.format(method, sigma), diameters)


In [None]:
# The testing set corresponds to the true matching.
true_matching = pd.read_csv('NEON/data/true_matching.csv', index_col = 0)
            
# Draw five disjoint folds of N = int(0.2 * n) rows each: every sampled fold is
# removed from `new_df` before the next draw. The same random_state is reused
# for each draw; rows left over after the fifth draw belong to no fold.
new_df = true_matching.copy()
n = len(new_df)
N = int(n*0.2)
Test = []
for i in range(5):
    test = new_df.sample(n=N, random_state=1)
    Test.append(test)
    new_df = new_df.drop(test.index) 

In [None]:
# Save the patches and species labels of each test fold; output files are numbered from 1.
for i, fold in enumerate(Test):
    patches, labels = get_patches_and_labels_neon(fold)
    np.save('NEON/cnn/test/patches_test_{}.npy'.format(i+1), patches)
    np.save('NEON/cnn/test/labels_test_{}.npy'.format(i+1), labels)

In [None]:
# Save the height labels of each test fold; output files are numbered from 1.
for i, fold in enumerate(Test):
    heights = get_height_labels_neon(fold)
    np.save('NEON/cnn/test/heights_test_{}.npy'.format(i+1), heights)

In [None]:
# Save the stem-diameter labels of each test fold; output files are numbered from 1.
for i, fold in enumerate(Test):
    diameters = get_diameter_labels_neon(fold)
    np.save('NEON/cnn/test/diameters_test_{}.npy'.format(i+1), diameters)

In [None]:
# Count how many rows of the true matching have a height and a stem diameter:
# take both columns as arrays, drop the NaN entries, and print the remaining lengths.
a = true_matching['height'].to_numpy()
b = true_matching['stemDiameter'].to_numpy()

a = a[~np.isnan(a)]
b = b[~np.isnan(b)]

print(len(a), len(b))


## Models

### BASIC 1

In [None]:
IMG_SIZE = 200

# Binary classifier: eight Conv2D -> MaxPooling2D -> BatchNormalization blocks
# followed by a small dense head with a single sigmoid output.
model = Sequential()

# First block: 6x6 kernels; this layer also declares the input shape.
model.add(Conv2D(32, kernel_size=6, activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(BatchNormalization(momentum=0.01))

# Remaining seven blocks: 3x3 kernels with a growing number of filters.
# NOTE(review): with the Keras default padding ('valid') the feature map seems
# to shrink 200 -> 97 -> 47 -> 22 -> 10 -> 4 -> 1 after six blocks, which would
# leave no room for the seventh 3x3 convolution. Confirm that this cell builds.
for n_filters in (32, 64, 64, 128, 128, 256, 256):
    model.add(Conv2D(n_filters, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(BatchNormalization(momentum=0.01))

# Dense head.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

### BASIC 2

In [None]:
# Binary classifier built in one go: the layer stack is handed to the
# Sequential constructor in the order the layers are applied.
model = Sequential([
    InputLayer(input_shape=(200, 200, 3)),

    # 1st conv block
    Conv2D(25, (5, 5), activation='relu', strides=(1, 1), padding='same'),
    MaxPool2D(pool_size=(2, 2), padding='same'),

    # 2nd conv block
    Conv2D(50, (5, 5), activation='relu', strides=(2, 2), padding='same'),
    MaxPool2D(pool_size=(2, 2), padding='same'),
    BatchNormalization(),

    # 3rd conv block
    Conv2D(70, (3, 3), activation='relu', strides=(2, 2), padding='same'),
    MaxPool2D(pool_size=(2, 2), padding='valid'),
    BatchNormalization(),

    # fully connected block
    Flatten(),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dropout(0.25),

    # output layer: one sigmoid unit
    Dense(units=1, activation='sigmoid'),
])


# compile model (alternative optimizers kept for reference)
#opt = tf.keras.optimizers.SGD(lr=0.01)
#opt = tf.keras.optimizers.Adam(learning_rate=0.1)
#opt = tf.keras.optimizers.RMSprop(lr=0.01)

model.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])




### RESNET

In [None]:
# ResNet50 backbone without its classification head and without pretrained
# weights; its last layer's output is flattened and wrapped as a feature extractor.
restnet = ResNet50(include_top=False, weights=None, input_shape=(200,200,3))
output = restnet.layers[-1].output
output = keras.layers.Flatten()(output)
# The functional Model constructor takes `inputs` / `outputs`; the previous
# `output=` keyword is a legacy spelling.
restnet = Model(inputs=restnet.input, outputs=output)
# Train every backbone layer (nothing is frozen).
for layer in restnet.layers:
    layer.trainable = True
restnet.summary()

# Binary classification head on top of the flattened backbone features.
model_resnet = Sequential()
model_resnet.add(restnet)
# The invalid `input_dim=(200,200,3)` tuple was dropped: this Dense layer is not
# the first layer of the model, so its input size comes from the backbone output.
model_resnet.add(Dense(512, activation='relu'))
model_resnet.add(Dropout(0.3))
model_resnet.add(Dense(512, activation='relu'))
model_resnet.add(Dropout(0.3))
model_resnet.add(Dense(1, activation='sigmoid'))
# NOTE(review): `lr` is the older name of the `learning_rate` argument; confirm
# that the installed Keras version still accepts it.
model_resnet.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=2e-5),
              metrics=['accuracy'])
model_resnet.summary()



### MOBILENET

In [None]:
# MobileNetV2 backbone without its classification head and without pretrained weights.
base_model = tf.keras.applications.MobileNetV2(include_top = False, weights=None, input_shape=(200, 200, 3))
x = base_model.output
# Global average pooling over the backbone output, then a single sigmoid unit.
x = tf.keras.layers.GlobalAveragePooling2D()(x)
preds =  tf.keras.layers.Dense(1, activation = tf.nn.sigmoid)(x)

model=tf.keras.Model(inputs=base_model.input,outputs=preds)


# NOTE(review): `lr` is the older name of the `learning_rate` argument; confirm
# that the installed TensorFlow version still accepts it.
opt = tf.keras.optimizers.SGD(lr=0.01)
#opt = tf.keras.optimizers.Adam(learning_rate=0.1)
#opt = tf.keras.optimizers.RMSprop(lr=0.01)

model.compile(optimizer=opt, loss = 'binary_crossentropy', metrics = ['accuracy'])

### NEON BASIC

In [None]:
IMG_SIZE = 224

# Four Conv2D -> MaxPooling2D -> BatchNormalization blocks followed by a dense
# head with a 63-way softmax output.
model = Sequential()

# First block: 6x6 kernels; this layer also declares the input shape.
model.add(Conv2D(64, kernel_size=6, activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(BatchNormalization(momentum=0.01))

# Remaining three blocks: 3x3 kernels.
for n_filters in (64, 32, 32):
    model.add(Conv2D(n_filters, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(BatchNormalization(momentum=0.01))

# Dense head.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(63, activation='softmax'))

In [None]:
# Install netron from within the notebook (IPython shell escape).
# NOTE(review): netron is not imported or used in the cells shown here.
!pip install netron

In [None]:
# Write a diagram of the current `model` to model.png, including layer names and shapes.
tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)