# Goal of this notebook: convert the .json annotations into useful .csv files

In [1]:
####################################
#largely taken from convert-data.py#
####################################

import csv
import json
import argparse
from pathlib import Path
from PIL import Image

DEFAULT_CSV = "/home/egoodman/multitaskmodel/data/raw_data_tools.csv"

SAIL_IMAGES_PATH = "/home/egoodman/multitaskmodel/data/images/"
SAIL_JSON_PATH = "/home/egoodman/multitaskmodel/data/jsons/complete_data.json"

In [2]:
def get_coordinates(position, img_width, img_height):
    """Scale a relative bounding box to pixel corner coordinates.

    ``position`` supplies "left", "top", "width" and "height"; each value is
    passed through ``float`` and multiplied by the matching image dimension.
    The result is ``[x1, y1, x2, y2]`` (top-left and bottom-right corners),
    truncated with ``int`` and returned as strings.
    """
    left, top, width, height = (
        position[key] for key in ("left", "top", "width", "height")
    )
    corners = (
        float(left) * img_width,
        float(top) * img_height,
        (float(left) + float(width)) * img_width,
        (float(top) + float(height)) * img_height,
    )
    return [str(int(value)) for value in corners]

In [3]:
def convert(images_path, json_path, selected_tool, ignore_negatives, acceptable, ignore_annotator, hands, ignore_chirality):
    """Flatten the JSON annotations into bounding-box rows written to DEFAULT_CSV.

    Every kept tool label becomes one row ``path, x1, y1, x2, y2, category``,
    where ``path`` is ``images_path + filename``. An image that produced no
    rows gets a single row with five empty fields after the path, unless
    ``ignore_negatives`` is set.

    Args:
        images_path: String prefix put directly in front of each image name
            (plain concatenation, so it must end with a path separator).
        json_path: JSON file whose top-level ``"data"`` entry lists the records.
        selected_tool: Keep only tool labels of this category; None keeps all.
            Labels of category "scalpel" are always dropped.
        ignore_negatives: When true, images without any written box are
            skipped instead of getting an empty row.
        acceptable: Container of video ids to keep, or None to keep every video.
        ignore_annotator: Skip records whose ``original_annotator_name`` equals
            this value; None disables the filter.
        hands: When true, tool labels are not written.
        ignore_chirality: Accepted for signature compatibility; not used here.
    """
    # The hand-label export was switched off in the original code with an
    # always-false `300==400` condition. It stays off here so the output is
    # unchanged. NOTE(review): presumably deliberate for the tools-only CSV --
    # confirm before gating this on `hands` instead.
    export_hands = False

    # Read the annotations first and close the handle right away.
    with open(json_path) as jf:
        json_data = json.load(jf)['data']

    # `with` guarantees the CSV is flushed and closed before anything reads it;
    # newline='' is what the csv module asks for when handing it a file object.
    with open(DEFAULT_CSV, 'w', newline='') as cf:
        filewriter = csv.writer(cf, delimiter=',')

        for data in json_data:
            if not (data["object_type"] == "image" and data["id"]):
                continue
            if ignore_annotator is not None and data["original_annotator_name"] == ignore_annotator:
                continue

            filename = data["name"]
            vid_id = data['video_id']
            if acceptable is not None and vid_id not in acceptable:
                continue

            image_file = images_path + filename
            # Only the size is needed, so release the image handle immediately.
            with Image.open(image_file) as img:
                img_width, img_height = img.size

            objects_in_image = 0

            if not hands and "tool_labels" in data:
                for tool_label in data["tool_labels"]:
                    category = tool_label["category"]
                    if category == "scalpel":
                        # Scalpel boxes are never exported.
                        continue
                    if selected_tool is not None and category != selected_tool:
                        continue
                    objects_in_image += 1
                    line = [image_file]
                    line += get_coordinates(tool_label["bounding_box_position"], img_width, img_height)
                    line.append(category)
                    filewriter.writerow(line)

            if export_hands and "hand_labels" in data:
                for hand_label in data["hand_labels"]:
                    objects_in_image += 1
                    line = [image_file]
                    line += get_coordinates(hand_label["bounding_box_position"], img_width, img_height)
                    line.append('hand')
                    filewriter.writerow(line)

            # Case where nothing was written for the image - add negative label
            if not ignore_negatives and objects_in_image == 0:
                filewriter.writerow([image_file, '', '', '', '', ''])

In [4]:
def main():
    """Resolve the input paths and options, then run ``convert``.

    The parser is fed an explicit empty argument list, so every option keeps
    its default and the SAIL_* paths are used. The flag-handling branches
    below are kept as they were in the original script.

    Raises:
        NameError: if --use_local, --aws or --quality_control is requested
            while the constants/helper it relies on are not defined.
    """
    parser = argparse.ArgumentParser(description='Script to convert data into csv format for pytorch-retinanet.')

    parser.add_argument('--datapath', help='Path to json annotations')
    parser.add_argument('--imagepath', help='Path to image directory')
    parser.add_argument('--use_local', help='Use pre-loaded LOCAL_IMAGES_PATH (check convert_data.py)', action="store_true")
    parser.add_argument('--focus_tool', help='Only use annotations for one particular tool')
    parser.add_argument('--quality_control', action='store_true')
    parser.add_argument('--ignore_negatives', action='store_true')
    parser.add_argument('--ignore_annotator')
    parser.add_argument('--hands', action='store_true')
    parser.add_argument('--ignore_chirality', action='store_true')
    parser.add_argument('--aws', action='store_true')

    # Parse an explicit empty list rather than sys.argv. The original passed
    # "" here, which argparse turns into the same empty list only because an
    # empty string iterates to nothing.
    args, _ = parser.parse_known_args([])

    images_path = SAIL_IMAGES_PATH
    json_path = SAIL_JSON_PATH

    if args.imagepath is not None:
        images_path = args.imagepath

    if args.datapath is not None:
        json_path = args.datapath

    # The names used by the next three branches are not defined alongside the
    # SAIL_* constants; fail with a message that says what is missing instead
    # of a bare NameError.
    if args.use_local:
        try:
            images_path = LOCAL_IMAGES_PATH
            json_path = LOCAL_JSON_PATH
        except NameError as exc:
            raise NameError("--use_local needs LOCAL_IMAGES_PATH and LOCAL_JSON_PATH to be defined before main() runs") from exc

    if args.aws:
        try:
            images_path = AWS_IMAGES_PATH
            json_path = AWS_DATA_PATH
        except NameError as exc:
            raise NameError("--aws needs AWS_IMAGES_PATH and AWS_DATA_PATH to be defined before main() runs") from exc

    tool = args.focus_tool
    if tool is not None:
        print("Focusing on tool: " + tool)

    acceptable_videos = None
    if args.quality_control:
        try:
            acceptable_videos = build_acceptable_videos(json_path)
        except NameError as exc:
            raise NameError("--quality_control needs build_acceptable_videos() to be defined before main() runs") from exc

    print("Converting json data from " + json_path)
    convert(images_path, json_path, tool, args.ignore_negatives, acceptable_videos, args.ignore_annotator, args.hands, args.ignore_chirality)
    print("Converted data saved under " + DEFAULT_CSV)

if __name__ == "__main__":
    main()

Converting json data from /home/egoodman/multitaskmodel/data/jsons/complete_data.json
Converted data saved under /home/egoodman/multitaskmodel/data/raw_data_tools.csv


In [5]:
"""
Splits data into training, test, and validation sets
"""
import pandas as pd
from pathlib import Path

def extract_video(image_path):
    """Return the video id embedded in an image path.

    The id is the file name (text after the last '/') with its final
    '-'-separated piece removed; a file name without any '-' yields ''.
    """
    # rpartition gives ('', '', text) when the separator is absent, so the
    # file name falls back to the whole path and the id to ''.
    file_name = image_path.rpartition('/')[2]
    video_id = file_name.rpartition('-')[0]
    return video_id

# Use the full option key: the bare 'precision' pattern is ambiguous or
# unavailable in newer pandas releases.
pd.set_option('display.precision', 0)

DEFAULT_RAW_DATA = "/home/egoodman/multitaskmodel/data/raw_data_tools.csv"

DEFAULT_TRAIN = "/home/egoodman/multitaskmodel/data/train_data_tools.csv"
DEFAULT_VAL = "/home/egoodman/multitaskmodel/data/val_data_tools.csv"
DEFAULT_TEST = "/home/egoodman/multitaskmodel/data/test_data_tools.csv"

TEST_SPLIT = "/home/egoodman/multitaskmodel/data/test_split.csv"
TRAIN_SPLIT = "/home/egoodman/multitaskmodel/data/train_split.csv"

# Share of the training videos held out for validation, and the seed that
# makes that draw identical on every run of the notebook.
VAL_FRACTION = 0.15
VAL_SPLIT_SEED = 0

# Raw rows have no header; column 0 holds the image path.
df = pd.read_csv(DEFAULT_RAW_DATA, dtype=str, header=None)

# Validation videos are drawn from the training split and then removed from
# it, so a video never appears in both sets.
train_vids = pd.read_csv(TRAIN_SPLIT, dtype=str)
val_vids = train_vids.sample(frac=VAL_FRACTION, random_state=VAL_SPLIT_SEED)
train_vids = train_vids.drop(val_vids.index)
test_vids = pd.read_csv(TEST_SPLIT, dtype=str)
vid_ids = df[0].map(extract_video)

# Build each id set once; the masks stay plain lists of booleans as before.
train_ids = set(train_vids["video_id"])
val_ids = set(val_vids["video_id"])
test_ids = set(test_vids["video_id"])

train_mask = [vid_id in train_ids for vid_id in vid_ids]
train = df[train_mask]
val_mask = [vid_id in val_ids for vid_id in vid_ids]
val = df[val_mask]
test_mask = [vid_id in test_ids for vid_id in vid_ids]
test = df[test_mask]

train.to_csv(DEFAULT_TRAIN, index=False, header=False)
val.to_csv(DEFAULT_VAL, index=False, header=False)
test.to_csv(DEFAULT_TEST, index=False, header=False)