# Ingest shapefiles and convert to lists of bounding boxes

**Before running this script, create a Google Drive folder containing the shapefiles of your VIA annotations and CNN outputs.**

<a href="https://colab.research.google.com/github/gl7176/CNN_tools/blob/main/Compare_VIA_to_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
#####  <center> Be sure to update the hyperlink above if you clone this notebook and want it to point to a different GitHub repository </center>

### Connect to our Google Drive folder and pull files
Note: running this cell prints a link that you must open. After granting Google the requested permissions, copy the verification code it gives you into the input box that appears in this cell's output.

If customizing this code, point the `drive_folder` variable at the URL of your shared Google Drive folder.

In [1]:
# URL of the shared Google Drive folder to pull from; the folder ID used for
# the Drive query is extracted from this URL when the query is built
drive_folder = 'https://drive.google.com/drive/folders/1Nqsx27thqFaGyrBkyaLJAT4VPuhp2I3d'

# enter approximate length of your object (here, a seal), in meters
# (this variable is used to draw the box around each point)
# NOTE(review): object_length is not referenced by any other code visible in
# this notebook -- confirm whether it is still needed here
object_length = 2.6

!pip install -U -q PyDrive
import os, csv
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# authenticate_user() runs the interactive Colab sign-in described above; the
# application-default credentials are then attached to the PyDrive auth object
# that backs the `drive` client used for listing and downloading files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Choose a local (Colab) directory to store the downloaded data and make sure
# it exists. exist_ok=True tolerates a directory left over from a previous run
# of this cell while still surfacing genuine failures (e.g. a permission
# error), which a bare `except: pass` would silently hide.
local_download_path = os.path.expanduser('data')
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
from urllib.parse import urlparse

# Point the query at the Google Drive folder. The folder ID is the last
# segment of the URL *path*; parsing the URL (instead of splitting the raw
# string) keeps this working for share links that end in "/" or carry a query
# string such as "?usp=sharing".
folder_id = urlparse(drive_folder).path.rstrip("/").split("/")[-1]
pointer = "'" + folder_id + "' in parents"

file_list = drive.ListFile({'q': pointer}).GetList()

# 3. Download every file in the folder by id into local_download_path.
#    The path of a downloaded ".shp" / ".json" file is remembered in
#    ptfile / tiling_scheme_file; if several files share an extension the last
#    one listed wins, and no check is made that any particular file is present.
#    NOTE(review): ptfile and tiling_scheme_file are not read by any later
#    code visible in this notebook -- confirm whether they are still needed.
for f in file_list:
  fname = os.path.join(local_download_path, f['title'])
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)
  print("Pulled file: " + fname)
  if fname.endswith(".shp"):
    ptfile = fname
  if fname.endswith(".json"):
    tiling_scheme_file = fname

Pulled file: data/VIA_test_tiles.prj
Pulled file: data/VIA_test_tiles.cpg
Pulled file: data/VIA_test_tiles.dbf
Pulled file: data/VIA_test_tiles.sbn
Pulled file: data/VIA_test_tiles.shx
Pulled file: data/VIA_test_tiles.shp.xml
Pulled file: data/VIA_test_tiles.sbx
Pulled file: data/VIA_test_tiles.shp
Pulled file: data/seal_detections.sbx
Pulled file: data/seal_detections.sbn
Pulled file: data/seal_detections.cpg
Pulled file: data/seal_detections.prj
Pulled file: data/seal_detections.shx
Pulled file: data/seal_detections.shp
Pulled file: data/seal_detections.dbf


In [2]:
!pip install geopandas
!pip install affine
!pip install rasterio

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.1 MB/s 
[?25hCollecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 45.1 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.20-cp37-cp37m-manylinux1_x86_64.whl (15.4 MB)
[K     |████████████████████████████████| 15.4 MB 44.3 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.20 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1
Collecting affine
  Downloading affine-2.3.0-py2.py3-none-any.whl (15 kB)
Installing collected pack

In [3]:
import os, fiona, sys, numpy as np, geopandas
from osgeo import ogr
from natsort import natsorted

# Paths of the two shapefiles to compare: the VIA annotations and the CNN
# detections. These are the ".shp" names reported by the download cell for
# the local `data` directory; update them if your Drive folder holds
# differently named files.
VIA_file = "data/VIA_test_tiles.shp"
CNN_file = "data/seal_detections.shp"

def shape2box(shpfile):
    """Read a shapefile and return one bounding-box entry per feature.

    Args:
        shpfile: Path of the shapefile to open with fiona.

    Returns:
        A list of dicts, one per record, each with two keys:
        ``"box"``   -- the first four vertices of the feature's first
                       coordinate ring, in reverse order, each vertex
                       converted to a list;
        ``"class"`` -- the value of the record's ``Detection`` property.

    The ``Detection`` lookup is not guarded, so every record is expected to
    carry that property.
    """
    box_list = []
    with fiona.open(shpfile) as c:
        for record in c:
            ring = record["geometry"]["coordinates"][0]
            # Only the first four vertices are kept (the ring's remaining
            # vertices are ignored), in reversed order.
            bounding_box = [list(vertex) for vertex in reversed(ring[0:4])]
            box_list.append({
                "box": bounding_box,
                "class": record["properties"]["Detection"],
            })
    return box_list

### Convert shapefile to bounding box list

In [21]:
# Minimum fraction of a B box's area that an A box must cover for the pair to
# count as a match.
OVERLAP_THRESHOLD = 0.6


def _corner_rows(box_list):
    """Return one [x, x, y, y] row per entry, taken from vertices 2 and 0 of its box."""
    return [[elem['box'][2][0], elem['box'][0][0],
             elem['box'][2][1], elem['box'][0][1]]
            for elem in box_list if 'box' in elem]


def _extents(boxes):
    """Return per-row (x_min, x_max, y_min, y_max) arrays, whatever the corner order."""
    # Ordering is resolved row by row, so a set whose rows do not all share
    # the corner order of row 0 is still handled correctly.
    return (np.minimum(boxes[:, 0], boxes[:, 1]),
            np.maximum(boxes[:, 0], boxes[:, 1]),
            np.minimum(boxes[:, 2], boxes[:, 3]),
            np.maximum(boxes[:, 2], boxes[:, 3]))


def _match_boxes(boxes_a, boxes_b, threshold=OVERLAP_THRESHOLD):
    """Greedily pair each A box with the still-unmatched B box it covers most.

    A boxes are visited in descending order of their upper y extent. For each
    one, the overlap ratio (intersection area / B box area) is computed
    against every B box not yet matched; the best candidate is accepted when
    its ratio exceeds ``threshold`` and is then removed from further matching.

    Returns (matched_a, matched_b, rejects_a, rejects_b) as lists of row
    indices; matched_a[k] is paired with matched_b[k].
    """
    x1_a, x2_a, y1_a, y2_a = _extents(boxes_a)
    x1_b, x2_b, y1_b, y2_b = _extents(boxes_b)

    # NOTE(review): the "+ 1" terms follow the inclusive pixel-index
    # convention. If the shapefile coordinates are in map units (e.g. metres)
    # this pads every width and height by one unit and inflates the overlap
    # ratio -- confirm this is intended.
    area_b = (x2_b - x1_b + 1) * (y2_b - y1_b + 1)

    remaining_b = np.arange(len(boxes_b))
    matched_a, matched_b, rejects_a = [], [], []

    for i in np.argsort(y2_a)[::-1]:
        i = int(i)
        # Once every B box has been consumed nothing is left to match against
        # (taking max() of an empty overlap list would raise).
        if remaining_b.size == 0:
            rejects_a.append(i)
            continue

        # Intersection rectangle between A box i and each remaining B box.
        xx1 = np.maximum(x1_a[i], x1_b[remaining_b])
        yy1 = np.maximum(y1_a[i], y1_b[remaining_b])
        xx2 = np.minimum(x2_a[i], x2_b[remaining_b])
        yy2 = np.minimum(y2_a[i], y2_b[remaining_b])
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        overlap = (w * h) / area_b[remaining_b]
        best = int(np.argmax(overlap))  # first index of the maximum
        if overlap[best] > threshold:
            matched_a.append(i)
            matched_b.append(int(remaining_b[best]))
            remaining_b = np.delete(remaining_b, best)
        else:
            rejects_a.append(i)

    return matched_a, matched_b, rejects_a, remaining_b.tolist()


boxlist1 = shape2box(VIA_file)
boxlist2 = shape2box(CNN_file)

boxes1 = _corner_rows(boxlist1)
boxes2 = _corner_rows(boxlist2)

# shape2box stores each record's Detection property under the "class" key
# (there is no "detection" key), so that is the key to read here.
detections1 = [elem['class'] for elem in boxlist1 if 'class' in elem]
detections2 = [elem['class'] for elem in boxlist2 if 'class' in elem]

# reshape(-1, 4) keeps the arrays two-dimensional even when a shapefile
# contains no features, so the column indexing below cannot fail.
bboxes1 = np.array(boxes1, dtype=float).reshape(-1, 4)
bboxes2 = np.array(boxes2, dtype=float).reshape(-1, 4)

# Decide which set plays the "A" role. option_var records the choice so that
# later cells can map A/B back to the VIA/CNN files (0: A is VIA, 1: A is CNN).
# NOTE(review): the choice is made by comparing the maximum of column 2 (a y
# coordinate) of each set; the rationale is not evident from the code --
# confirm.
if np.max(bboxes1[:, 2], initial=-np.inf) > np.max(bboxes2[:, 2], initial=-np.inf):
    boxesA, boxesB, option_var = bboxes1, bboxes2, 0
else:
    boxesA, boxesB, option_var = bboxes2, bboxes1, 1

matched_A, matched_B, rejects_A, rejects_B = _match_boxes(boxesA, boxesB)

print(len(matched_A), len(matched_B), len(rejects_A), len(rejects_B))
print(len(boxesA), len(boxesB))


#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX#
# overlap is now given between each compared square and the overlap with target square.
# must identify the compared square with max overlap beyond acceptable threshold
# pull the indexed choice and the compared square into the matched list
# assign if none found, where the index goes
# when all are matched, assign where comparison remnants go
# in any case, remove index each round to either destination to achieve loop progress 

# achieve all sorting by shuttling indices around
# remember to conserve/send detection type as well by end product
# check detection type manually, if needed, in shapefiles and GIS

# return the index of the bounding boxes that were picked


343 343 217 93
560 436


In [27]:
# Reload both layers so that `f` is the layer the A indices refer to and `g`
# the layer the B indices refer to: option_var == 1 means the CNN boxes were
# chosen as set A, otherwise the VIA boxes were.
# NOTE(review): this relies on geopandas returning features in the same order
# in which shape2box read them -- confirm.
path_A, path_B = (CNN_file, VIA_file) if option_var == 1 else (VIA_file, CNN_file)
f = geopandas.read_file(path_A)
g = geopandas.read_file(path_B)

matched_A_out = f.iloc[matched_A]
rejects_A_out = f.iloc[rejects_A]
matched_B_out = g.iloc[matched_B]
rejects_B_out = g.iloc[rejects_B]

# Set output directory, create it if necessary
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Write each selection into output_dir (the paths follow the variable, so
# changing output_dir above is enough to redirect all four files).
matched_A_out.to_file(os.path.join(output_dir, "matched_A.shp"))
rejects_A_out.to_file(os.path.join(output_dir, "rejects_A.shp"))
matched_B_out.to_file(os.path.join(output_dir, "matched_B.shp"))
rejects_B_out.to_file(os.path.join(output_dir, "rejects_B.shp"))


In [28]:
# zip up the output directory into an archive for download
import subprocess
from google.colab import files

output_file_name = 'Step_5_{o}'.format(o=output_dir)
archive_name = output_file_name + '.zip'

# check=True raises CalledProcessError when zip exits with a non-zero status,
# rather than carrying on to download a missing or incomplete archive.
subprocess.run(['zip', '-r', archive_name, '/content/' + output_dir], check=True)

files.download(archive_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Output detections in VIA format

In [25]:
# add class info later, when we have it on-hand to work with
# class_category = "Age Class"

# Example rows in the target VIA CSV format:
#2015_02_02_hay_island_flight03_s110rgb_jpeg_mosaic_group1---28.png,1613979,"{}",4,0,"{""name"":""rect"",""x"":615,""y"":927,""width"":66,""height"":32}","{""Age Class"":""Adult""}"
#2015_02_02_hay_island_flight03_s110rgb_jpeg_mosaic_group1---28.png,1613979,"{}",4,1,"{""name"":""rect"",""x"":959,""y"":917,""width"":39,""height"":26}","{""Age Class"":""Pup""}"
new_line = [["filename","file_size","file_attributes","region_count","region_id","region_shape_attributes","region_attributes"]]

# NOTE(review): `entry_list` is not defined by any cell visible in this
# notebook, so this cell fails with NameError until it is supplied. Each entry
# is read for a "tile_ID" and a "box" whose vertices 3 and 1 give opposite
# corners of the rectangle.
current_tile = object()  # sentinel that never equals a real tile ID
for detection in entry_list:
    print(detection["box"])

    # region_id restarts at 0 whenever the tile changes, so entries are
    # expected to arrive grouped by tile. Starting from a sentinel guarantees
    # `count` is initialised on the first entry whatever its tile ID is.
    if detection["tile_ID"] != current_tile:
        current_tile = detection["tile_ID"]
        count = 0
    else:
        count += 1

    x1, y1 = detection["box"][3][0], detection["box"][3][1]
    x2, y2 = detection["box"][1][0], detection["box"][1][1]
    #print("x1={x1}, x2={x2}, y1={y1}, y2={y2}".format(x1=x1,x2=x2,y1=y1,y2=y2))
    region_shape_attributes = {"name":"rect", "x":x1, "y":y1, "width":x2-x1, "height":y2-y1}
    region_attributes = {}

    # The two attribute dicts are written as text with double quotes; file
    # size and region count are left blank, file attributes are "{}".
    new_line.append([
        current_tile,
        "",
        "{}",
        "",
        count,
        str(region_shape_attributes).replace("'", '"'),
        str(region_attributes).replace("'", '"'),
    ])

# Set output directory, create it if necessary
output_dir = 'via_annotations'
os.makedirs(output_dir, exist_ok=True)

# write out new VIA file with additional detections
via_csv_path = os.path.join(output_dir, 'new_VIA_annotations.csv')
with open(via_csv_path, 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows(new_line)

from google.colab import files
files.download(via_csv_path)

NameError: ignored