# Ingest shapefiles and convert to lists of bounding boxes

**Before running this script, create a Google Drive folder with shapefiles of your VIA annotations and CNN outputs.**

<a href="https://colab.research.google.com/github/gl7176/CNN_tools/blob/main/Compare_VIA_to_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
#####  <center> Be sure to update the hyperlink above if you clone this notebook and want to point to a different GitHub repository </center>

### Connect to our Google Drive folder and pull files
Note: when you run this cell, it will give you a link that you must click. You must grant Google some permissions, then copy the resulting code into the box that appears in the output section of this cell.

If customizing this code, you will need to point the `drive_folder` variable to the URL of your shared Google Drive folder.

In [3]:
# URL of the shared Google Drive folder you want to pull files from.
# The folder ID is read from the end of this URL when the Drive query is
# built further down in this cell.
drive_folder = 'https://drive.google.com/drive/folders/1Nqsx27thqFaGyrBkyaLJAT4VPuhp2I3d'

# Approximate length of your object (here, a seal), in meters
# (this variable is used to draw the box around each point).
# NOTE(review): no use of object_length is visible in this part of the
# notebook -- confirm it is still needed.
object_length = 2.6

!pip install -U -q PyDrive
import os, csv
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# Running this prompts for Google sign-in / permissions in the cell output.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials obtained from oauth2client.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used below to list and download the folder contents.
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('data')
# exist_ok=True tolerates an already-existing directory (e.g. when this cell
# is re-run), while genuine failures such as a permission error are raised
# instead of being silently ignored.
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters

# this bit points the code to that google drive folder
# The folder ID is the last path segment of the URL. Any query string or
# fragment (e.g. "?usp=sharing") and any trailing slash are dropped first, so
# URLs copied from the Drive "share" dialog resolve to the same folder ID.
folder_id = drive_folder.split("?")[0].split("#")[0].rstrip("/").split("/")[-1]
pointer = "'" + folder_id + "'" + " in parents"

# List every file whose parent is that folder.
file_list = drive.ListFile(
    {'q': pointer}).GetList()

#    this bit pulls key files from the directory specified above
#    and checks that all necessary files are present


# Paths of the downloaded ".shp" / ".json" files. They stay None when the
# folder holds no such file, so later code can test for a missing file
# instead of failing with a NameError. When several files share an extension,
# the last one downloaded wins.
ptfile = None
tiling_scheme_file = None

for f in file_list:
  # 3. Create & download by id.
  fname = os.path.join(local_download_path, f['title'])
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)
  print("Pulled file: " + fname)
  if fname.endswith(".shp"):
    ptfile = fname
  elif fname.endswith(".json"):
    tiling_scheme_file = fname

Pulled file: data/VIA_test_tiles.prj
Pulled file: data/VIA_test_tiles.cpg
Pulled file: data/VIA_test_tiles.dbf
Pulled file: data/VIA_test_tiles.sbn
Pulled file: data/VIA_test_tiles.shx
Pulled file: data/VIA_test_tiles.shp.xml
Pulled file: data/VIA_test_tiles.sbx
Pulled file: data/VIA_test_tiles.shp
Pulled file: data/seal_detections.sbx
Pulled file: data/seal_detections.sbn
Pulled file: data/seal_detections.cpg
Pulled file: data/seal_detections.prj
Pulled file: data/seal_detections.shx
Pulled file: data/seal_detections.shp
Pulled file: data/seal_detections.dbf


In [4]:
!pip install geopandas
!pip install affine
!pip install rasterio

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[?25l[K     |▎                               | 10 kB 12.2 MB/s eta 0:00:01[K     |▋                               | 20 kB 15.6 MB/s eta 0:00:01[K     |█                               | 30 kB 7.5 MB/s eta 0:00:01[K     |█▎                              | 40 kB 6.9 MB/s eta 0:00:01[K     |█▋                              | 51 kB 5.2 MB/s eta 0:00:01[K     |██                              | 61 kB 5.2 MB/s eta 0:00:01[K     |██▎                             | 71 kB 5.6 MB/s eta 0:00:01[K     |██▌                             | 81 kB 6.3 MB/s eta 0:00:01[K     |██▉                             | 92 kB 4.8 MB/s eta 0:00:01[K     |███▏                            | 102 kB 4.8 MB/s eta 0:00:01[K     |███▌                            | 112 kB 4.8 MB/s eta 0:00:01[K     |███▉                            | 122 kB 4.8 MB/s eta 0:00:01[K     |████▏                           | 133 kB 4.8 MB/s eta 0:0

In [71]:
import os, fiona, sys, numpy as np
from osgeo import ogr
from natsort import natsorted

# Shapefiles to compare; both are read with shape2box() below.
# NOTE(review): these paths are hard-coded to file names from the example
# Drive folder -- update them if your folder holds different files.
VIA_file = "data/VIA_test_tiles.shp"
CNN_file = "data/seal_detections.shp"

def shape2box(shpfile, class_field="Detection"):
  """Read a shapefile and return one bounding-box entry per record.

  Args:
    shpfile: path of the shapefile, opened with ``fiona.open``.
    class_field: name of the record property copied into each entry's
      "class" key. Defaults to "Detection".

  Returns:
    A list of dicts ``{"box": [[x, y], ...], "class": value}``, where "box"
    holds the first four vertices of the record's first coordinate ring, in
    reversed order, each converted to a list.
  """
  box_list = []
  with fiona.open(shpfile) as c:
    for record in c:
      # First coordinate ring of the geometry.
      # NOTE(review): assumes every record is a rectangular polygon whose
      # ring repeats its first vertex at the end, so four vertices describe
      # the whole box -- confirm for other inputs.
      ring = record["geometry"]["coordinates"][0]
      bounding_box = [list(vertex) for vertex in reversed(ring[0:4])]
      box_list.append({"box": bounding_box,
                       "class": record["properties"][class_field]})
  return box_list

### Convert shapefile to bounding box list

In [72]:
# Load both shapefiles as lists of {"box", "class"} entries (see shape2box).
VIA_box_list = shape2box(VIA_file)
CNN_box_list = shape2box(CNN_file)

def _box_extents(boxlist):
  """Return an (N, 4) array of [x_min, y_min, x_max, y_max] per 'box' entry."""
  boxes = [elem['box'] for elem in boxlist if 'box' in elem]
  if not boxes:
    return np.empty((0, 4), dtype=float)
  # A box may be a list of [x, y] corner points (as built by shape2box) or a
  # flat [x1, y1, x2, y2] list; either way its values are read as (x, y)
  # pairs, so the vertex order does not matter.
  points = np.asarray(boxes, dtype=float).reshape(len(boxes), -1, 2)
  return np.hstack([points.min(axis=1), points.max(axis=1)])


def ID_overlaps(boxlist1, boxlist2, overlapThresh=0.6):
  """Match boxes of ``boxlist1`` to overlapping boxes of ``boxlist2``.

  Each box of ``boxlist1`` is compared with every still-unmatched box of
  ``boxlist2``. The overlap ratio is the intersection area divided by the
  area of the ``boxlist2`` box. The candidate with the largest ratio is
  accepted when that ratio exceeds ``overlapThresh``; a ``boxlist2`` box is
  matched at most once. Boxes of ``boxlist1`` are processed from the largest
  upper y-coordinate downwards.

  Coordinates are treated as continuous values, so no "+ 1" pixel correction
  is applied when computing widths and heights.

  Args:
    boxlist1: list of dicts with a "box" key (corner points or a flat
      [x1, y1, x2, y2] list). Entries without "box" are ignored.
    boxlist2: same structure as ``boxlist1``.
    overlapThresh: minimum overlap ratio (exclusive) for a match.

  Returns:
    A list of ``(index1, index2)`` tuples, one per match. The indexes count
    only the entries of each list that carry a "box" key. The list is empty
    when either input has no boxes.
  """
  extents1 = _box_extents(boxlist1)
  extents2 = _box_extents(boxlist2)

  # initialize the list of picked (index1, index2) matches
  pick = []
  if len(extents1) == 0 or len(extents2) == 0:
    return pick

  # grab the coordinates of the bounding boxes
  x1_1, y1_1, x2_1, y2_1 = extents1.T
  x1_2, y1_2, x2_2, y2_2 = extents2.T

  area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
  unmatched2 = np.ones(len(extents2), dtype=bool)

  # visit the boxes of list 1 from the largest upper y-coordinate downwards
  for i in np.argsort(y2_1)[::-1]:
    # width and height of the intersection with every box of list 2
    w = np.maximum(0, np.minimum(x2_1[i], x2_2) - np.maximum(x1_1[i], x1_2))
    h = np.maximum(0, np.minimum(y2_1[i], y2_2) - np.maximum(y1_1[i], y1_2))

    # ratio of overlap; zero-area boxes keep a ratio of 0 instead of dividing
    overlap = np.zeros(len(extents2))
    np.divide(w * h, area2, out=overlap, where=area2 > 0)

    # boxes of list 2 that are already matched can never be picked again
    overlap[~unmatched2] = -np.inf

    j = int(np.argmax(overlap))
    if overlap[j] > overlapThresh:
      pick.append((int(i), j))
      unmatched2[j] = False

  # return the index pairs of the bounding boxes that were matched
  return pick

In [224]:
# Work-in-progress, cell-level version of the overlap matching between the
# VIA annotations (list 1) and the CNN detections (list 2).
boxlist1 = VIA_box_list
boxlist2 = CNN_box_list

# Flatten each box to
# [x of vertex 2, x of vertex 0, y of vertex 2, y of vertex 0].
boxes1 = [[elem['box'][2][0], elem['box'][0][0], elem['box'][2][1], elem['box'][0][1]] for elem in boxlist1 if 'box' in elem]
boxes2 = [[elem['box'][2][0], elem['box'][0][0], elem['box'][2][1], elem['box'][0][1]] for elem in boxlist2 if 'box' in elem]

# NOTE(review): shape2box() stores its label under "class", not "detection",
# so these lists are empty for its output; they are not read again in this cell.
detections1 = [elem['detection'] for elem in boxlist1 if 'detection' in elem]
detections2 = [elem['detection'] for elem in boxlist2 if 'detection' in elem]

bboxes1 = np.array(boxes1)
bboxes2 = np.array(boxes2)

# "A" is the list whose column 2 (y of vertex 2) has the larger maximum;
# option_var records which input became A (0: list 1, 1: list 2).
# NOTE(review): option_var is not read again in this cell.
if max(bboxes1[:, 2]) > max(bboxes2[:, 2]):
  boxesA = bboxes1
  boxesB = bboxes2
  option_var = 0
else:
  boxesA = bboxes2
  boxesB = bboxes1
  option_var = 1

# grab the coordinates of the bounding boxes
# note: each polygon in this instance has boxes in different orders (VIA vs. CNN): consider generalizing with if:then clauses
# A reads x1/x2 from columns 1/0 and y1/y2 from columns 3/2, while B reads
# them from columns 0/1 and 2/3.
x1_A, x2_A, y1_A, y2_A = boxesA[:, 1], boxesA[:, 0], boxesA[:, 3], boxesA[:, 2]
x1_B, x2_B, y1_B, y2_B = boxesB[:, 0], boxesB[:, 1], boxesB[:, 2], boxesB[:, 3]
# debug output: first three coordinates of each list
print(x1_A[0:3], x2_A[0:3], y1_A[0:3], y2_A[0:3])
print(x1_B[0:3], x2_B[0:3], y1_B[0:3], y2_B[0:3])
# NOTE(review): the "+ 1" terms look like pixel-index box arithmetic, but the
# coordinates printed by this cell are fractional map coordinates -- confirm
# the +1 is wanted. area_A is not read again in this cell.
area_A = (x2_A - x1_A + 1) * (y2_A - y1_A + 1)
area_B = (x2_B - x1_B + 1) * (y2_B - y1_B + 1)
idxs_A = y2_A
idxs_B = y2_B

# sort the indexes (ascending y2); idxs_B is not read again in this cell
idxs_A = np.argsort(idxs_A)
idxs_B = np.argsort(idxs_B)

# initialize the result lists (none of them is filled in yet, see TODO below)
matches = []
rejects_A = []
rejects_B = []

# keep looping while some indexes still remain in the indexes list
while len(idxs_A) > 0:
    # grab the last index in the indexes list (the box of A with the
    # largest y2 that has not been processed yet)
    last = len(idxs_A) - 1
    i = idxs_A[last]

    # find the largest (x, y) coordinates for the start of the bounding
    # box and the smallest (x, y) coordinates for the end of the bounding
    # box, comparing box i of A against every box of B

    xx1 = np.maximum(x1_A[i], x1_B)
    yy1 = np.maximum(y1_A[i], y1_B)
    xx2 = np.minimum(x2_A[i], x2_B)
    yy2 = np.minimum(y2_A[i], y2_B)

    # compute the width and height of the intersection
    w, h = np.maximum(0, xx2 - xx1 + 1), np.maximum(0, yy2 - yy1 + 1)

    # compute the ratio of overlap, relative to the area of each box of B
    overlap = (w * h) / area_B
# TODO (unfinished): `overlap` now holds, for box i of A, its overlap ratio
# with every box of B. Still to do:
# - identify the box of B with the maximum overlap beyond the acceptable threshold
# - put box i and that box of B into the matched list
# - decide where index i goes when no match is found
# - when all are matched, decide where the remaining boxes of B go
# - in any case, remove index i each round so that the loop makes progress
    # earlier draft, kept for reference: delete all indexes from the index
    # list that have overlap greater than the provided overlap threshold
#    idxs_A = np.delete(idxs_A, np.concatenate(([last],
#         np.where(overlap > 0.6)[0])))
    # for now only the processed index is dropped; `overlap` is discarded
    idxs_A = np.delete(idxs_A, last)

# return the index of the bounding boxes that were picked


[291961.12897 292189.54747 291901.14653] [291964.20036 292192.2059  291904.32116] [5099902.4407  5099905.5379  5099913.74548] [5099904.66036 5099907.73175 5099915.73285]
[292296.81383 292138.46948 291913.12237] [292298.00109 292139.63093 291914.05153] [5100306.00586 5099913.17766 5099932.48354] [5100308.1739  5099914.77788 5099934.10957]
0.8794899153025963
0.10272827142280587
0.03354650662997403
0.056474184765067256
0.9460314304946028
0.06527891299494665
0.27631128938625626
0.18931830681663667
0.8966158980379368
0.8940890901045616
0.33447365639598703
0.30603081005433724
0.09276357146555661
0.8268861864083301
0.1344528523019065
0.7609275869873934
0.9032982974799373
0.23567809833383585
0.15510966687707725
0.008865324160449316
0.10646848486369691
0.006496102667548833
0.05584754800206752
0.009307066964023527
0.7143671564047374
0.33514758598807076
0.6731124535423945
0.35623253080341905
0.6567415356039299
0.4374045321208857
0.9106652516328531
1.0
0.2565845939913706
0.9350315903867795
0.86010

### Output detections in VIA format

In [None]:
# add class info later, when we have it on-hand to work with
# class_category = "Age Class"

#2015_02_02_hay_island_flight03_s110rgb_jpeg_mosaic_group1---28.png,1613979,"{}",4,0,"{""name"":""rect"",""x"":615,""y"":927,""width"":66,""height"":32}","{""Age Class"":""Adult""}"
#2015_02_02_hay_island_flight03_s110rgb_jpeg_mosaic_group1---28.png,1613979,"{}",4,1,"{""name"":""rect"",""x"":959,""y"":917,""width"":39,""height"":26}","{""Age Class"":""Pup""}"
import json

# Rows of the VIA CSV file, starting with the header row.
new_line = [["filename","file_size","file_attributes","region_count","region_id","region_shape_attributes","region_attributes"]]

# None (rather than "") guarantees that the first detection always starts a
# new file and initialises `count`, even if its tile ID is an empty string.
filename = None
for detection in entry_list:
    print(detection["box"])
    # region IDs restart at 0 whenever the tile (file) changes, so the
    # detections are expected to be grouped by tile_ID
    if filename != detection["tile_ID"]:
        filename = detection["tile_ID"]
        count = 0
    else:
        count += 1
    file_size = ""
    file_attributes = "{}"
    # vertices 3 and 1 of the box are used as the two opposite corners
    x1 = detection["box"][3][0]
    y1 = detection["box"][3][1]
    x2 = detection["box"][1][0]
    y2 = detection["box"][1][1]
    region_shape_attributes = {"name":"rect", "x":x1, "y":y1, "width":x2-x1, "height":y2-y1}
    # NOTE(review): region_count is written empty, while the sample VIA rows
    # above carry the number of regions per file -- confirm VIA accepts this.
    region_count = ""
    region_attributes = {}
    region_ID = count
    # json.dumps emits valid JSON (double quotes, proper escaping) instead of
    # rewriting the quotes of a Python dict repr
    new_line.append([filename, file_size, file_attributes, region_count, region_ID,
                     json.dumps(region_shape_attributes), json.dumps(region_attributes)])

# Set output directory, create it if necessary
output_dir = 'via_annotations'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'new_VIA_annotations.csv')

# write out new VIA file with additional detections
with open(output_path, 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows(new_line)

# offer the written file as a browser download (Colab only)
from google.colab import files
files.download(output_path)