# Subset and convert VIA annotations CSV file to RetinaNet CSV format

#### Before running this script, make sure that your Google Drive folder contains the tiles you created (step 1) and the annotations CSV that you exported from VIA (step 2) and no other CSV or PNG files (if multiple CSV files are present, you will need to modify code to point to the exact file). It's fine if the orthomosaic and JSON files are in there (they will be ignored).

<a href="https://colab.research.google.com/github/gl7176/GreySealCNN/blob/master/3_VIA_to_RetinaNet_subsetted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
#####  <center> If you clone this repository, be sure to update the hyperlink above so that it points to your own GitHub repository </center>

### Connect to our Google Drive folder and pull annotation CSV
Note: when you run this it will give you a link that you must click. You must give Google some permissions, then copy a code into a box that comes up in the output section of this code.

In [None]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('VIA_annotations')
try:
  os.makedirs(local_download_path)
except: pass

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters

# set variable to the destination google drive folder you want to pull from
drive_folder = 'https://drive.google.com/drive/folders/1INuRNVKvKMy8L_Nb6lmoVbyvScWK0-0D'

# this bit points the code to that google drive folder
pointer = str("'" + drive_folder.split("/")[-1] + "'" + " in parents")

file_list = drive.ListFile(
    {'q': pointer}).GetList()

# this bit examines every file in the directory specified above and pulls the first CSV file it finds
# it also compiles the list of all images

# if there are multiple CSV files present it will spit an error and you should modify the code
# to point to the intended CSV file and re-run it

annotations_file = {}
image_list = []

count = 0
for f in file_list:
  count += 1
  if count % 10 == 0:
    print(count)
  # 3. Create & download CSV annotations file
  fname = os.path.join(local_download_path, f['title'])
  if fname.endswith(".png"):
    image_list.append(fname.split("/")[1])
  if fname.endswith(".csv"): 
      if len(annotations_file) != 0:
            if fname.endswith("classes.csv") or fname.endswith("subset_list.csv"):
              pass
            else:
              raise Exception("more than one annotations file identified (" + fname + " was unexpected)")
      annotations_file = fname
      f_ = drive.CreateFile({'id': f['id']})
f_.GetContentFile(annotations_file)
print("annotations file identified as " + annotations_file)

### Set up the python environment

In [None]:
# import necessary modules
import os
import csv
import random

# if running code on a local machine, manually point to the annotations_file
# (replace the right-hand side with the path to your VIA export)
annotations_file = annotations_file

# set pseudo-random values for replicability
random.seed(1)

# use this variable to set output directory
output_dir = 'RetinaNet_annotations'

# create the dir if it doesn't already exist; exist_ok avoids the
# check-then-create gap and makes re-running this cell harmless
os.makedirs(output_dir, exist_ok=True)

### Shuffle and split images into 3 datasets: Training, Testing, Validation

In [11]:
import math

# shuffle the image list randomly and get total count
# (the shuffle is in place, so re-running this cell alone shuffles the
# already-shuffled list again; re-run from the seeding cell to reproduce a split)
random.shuffle(image_list)
total_count = len(image_list)

# set fractions for breaking up the total dataset into TTV parts
test_fraction = 0.1
valid_fraction = 0.04
train_fraction = 0.86

# raise an error if the fractions don't add up; compare with a tolerance because
# floating-point sums of valid fractions are not always exactly 1.0
if not math.isclose(test_fraction + valid_fraction + train_fraction, 1.0):
    raise Exception("fractions should add up to 1")

test_index = int(total_count * test_fraction)
valid_index = int(total_count * (test_fraction + valid_fraction))

# use indices to break up dataset into TTV parts
test_dataset = image_list[:test_index]
valid_dataset = image_list[test_index:valid_index]
train_dataset = image_list[valid_index:]
print(len(test_dataset), len(valid_dataset), len(train_dataset))

# write a CSV listing each image with the subset it was assigned to
subset_list = (
    [[image_name, "test"] for image_name in test_dataset]
    + [[image_name, "validation"] for image_name in valid_dataset]
    + [[image_name, "training"] for image_name in train_dataset]
)
with open(output_dir + '/subset_list.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows(subset_list)

### Reformat annotations from VIA to RetinaNet format
The following loop reads each annotation, line by line, from the VIA-exported CSV, extracts the necessary information, converts it into the format that RetinaNet requires (https://github.com/fizyr/keras-retinanet#annotations-format), and assembles new CSV rows that RetinaNet can read.

In [5]:
import json

# Create blank variable for each annotations list as we build it
image_annotations_train = []
image_annotations_test = []
image_annotations_valid = []

# Create blank list for class names
class_list = []

# Map every image name to the annotation list of its subset. setdefault keeps
# the precedence training > testing > validation should a name occur in more
# than one subset, and a dict lookup avoids rescanning the lists for every row.
subset_rows_by_image = {}
for subset_images, subset_rows in (
    (train_dataset, image_annotations_train),
    (test_dataset, image_annotations_test),
    (valid_dataset, image_annotations_valid),
):
    for image_name in subset_images:
        subset_rows_by_image.setdefault(image_name, subset_rows)

# Tally of annotations that were dropped, reported after the loop so that
# nothing disappears silently
skipped = {
    "region is not a rectangle": 0,
    "region has no class label": 0,
    "image is not in any subset": 0,
}

# read each line, parse it, convert it, put it all back together
# then drop it in the appropriate subset
with open(annotations_file, "r", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=",")
    for line in reader:
        # output we want:
        # format: path/to/image.jpg,x1,y1,x2,y2,class_name
        # example: /data/imgs/img_001.jpg,837,346,981,456,cow
        if not line or 'filename' in line[0]:
            # bypassing blank lines and the header/comment line in csv
            continue

        filename = line[0]

        # column named "region_shape_attributes" holds a JSON object
        shape = json.loads(line[5])
        if not shape:
            # bypassing empty images
            continue
        if shape.get("name") != "rect":
            # only rectangles carry the x/y/width/height fields used below
            skipped["region is not a rectangle"] += 1
            continue

        # values may be fractional; truncate to whole pixels
        top_left_x, top_left_y, width, height = (
            int(float(shape[key])) for key in ("x", "y", "width", "height")
        )
        if width == 0 or height == 0:
            # skip tiny/empty boxes
            continue

        # convert from "top left and width/height" to "x and y values at each corner of the box"
        # the far corner is computed from the unclamped origin so that a box
        # hanging off the top/left edge is clipped rather than shifted
        x2 = top_left_x + width
        y2 = top_left_y + height
        x1 = max(top_left_x, 0)
        y1 = max(top_left_y, 0)
        if x2 <= x1 or y2 <= y1:
            # nothing of the box is left inside the image after clipping
            continue

        # column named "region_attributes" holds a JSON object; the class name
        # is taken from its first attribute
        attributes = json.loads(line[6])
        if not attributes:
            skipped["region has no class label"] += 1
            continue
        name = str(next(iter(attributes.values())))

        # skip unknown class, in this case. Might be useful in other applications though, e.g. total count
        if name == "Unknown":
            continue

        # append the row to the correct subset (training, testing, or validation);
        # annotations for images that were never assigned to a subset are dropped
        # instead of being filed under validation
        target_rows = subset_rows_by_image.get(filename)
        if target_rows is None:
            skipped["image is not in any subset"] += 1
            continue

        # build list of classes as we encounter new names
        if name not in class_list:
            class_list.append(name)

        target_rows.append([filename, x1, y1, x2, y2, name])

for reason, skipped_total in skipped.items():
    if skipped_total:
        print("skipped " + str(skipped_total) + " annotation(s): " + reason)

NameError: name 'annotations_file' is not defined

### Output annotations.csv and classes.csv

In [42]:
# write one RetinaNet annotations CSV per subset into the output directory
subset_outputs = (
    ('/annotations_train.csv', image_annotations_train),
    ('/annotations_test.csv', image_annotations_test),
    ('/annotations_valid.csv', image_annotations_valid),
)

for csv_suffix, annotation_rows in subset_outputs:
    with open(output_dir + csv_suffix, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerows(annotation_rows)

In [43]:
# Build the classes file from class_list (collected while processing annotations):
# one row per class name paired with its zero-based position in that list.
# Note again that "Unknown" ambiguous cases were excluded upstream.

detection_classes = [
    [class_name, class_id] for class_id, class_name in enumerate(class_list)
]

classes_path = output_dir + '/classes.csv'
with open(classes_path, 'w', newline='') as fp:
    csv.writer(fp).writerows(detection_classes)

#### Zip data folder for download

In [None]:
# bundle the output directory into a zip archive, then hand it to the browser
import subprocess

archive_source = '/content/' + output_dir
archive_target = archive_source + '.zip'
subprocess.call(['zip', '-r', archive_target, archive_source])

# imported only after zipping, matching the order in which the steps run
from google.colab import files
files.download(archive_target)

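##### At the end of this script you should have downloaded a ZIP archive of the output folder containing 5 CSV files: the training/testing/validation annotations (`annotations_train.csv`, `annotations_test.csv`, `annotations_valid.csv`), `classes.csv`, and `subset_list.csv`. Unzip it and drop these files in the Google Drive directory so they can be ingested by our CNN code in the next step.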

Next steps:

4) train, refine, and test CNN using VIA annotations and the tiles generated here

5) export CNN outputs

##### Not a code issue, but I recommend manually checking each annotations document to make sure that there are a reasonable number of annotations in each dataset; it is possible that the random breakdown could pick a cluster of "empty" images for validation or training. Should not be an issue in this specific case because I've checked it for our random seed, but in future applications this is a good idea.