# Generate Dataset

The purpose of this notebook is to generate the Pascal-style dataset that we will use for our ML project. The process is mostly automated; the only manual step is correcting errors that were made during annotation.

In [None]:
# Show matplotlib figures inline, directly below the cell that produces them
%matplotlib inline

# Automatically reload modules when they have changed, so edits to imported
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import json
import re
import zipfile
import tqdm
import time
import cv2

from math import floor, ceil

In [None]:
# Absolute path to the local checkout of the surgical-training-project repository.
# NOTE(review): machine-specific path; it has to be edited before running elsewhere.
path_to_repo = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/surgical-training-project/'

# Make the repository's tools/ directory importable.
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, os.path.join(path_to_repo, 'tools'))

In [None]:
from drive_dataset import (
    SurgicalVideoAnnotation, 
    extract_and_move_images, 
    create_retinanet_csv, 
    fix_S810T1b,
    add_missing_muscle,
    compile_reannotation_folder,
    replace_qc_frames,
    get_trial_validation_set, get_trial_test_set
    
)
from utils import plot_frame_with_bb, convert_frame_object_to_xml

In [None]:
import pandas as pd

The drive directory is the directory containing the zip files with all of the annotations. We download it directly from the drive and decompress it. The `drive_dir` variable defined below points to its location.

In [None]:
from datetime import datetime

In [None]:
# Date stamp (YYYYMMDD) used in the names of files written by this run
todays_date = datetime.today().strftime('%Y%m%d')

# Local checkout of the surgical-training-project repository
surgical_git_dir = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/surgical-training-project/'

# Directory with the annotation exports (zips and csvs) downloaded from the drive
drive_dir = '/Users/guillaumekugener/Downloads/Completed Annotations 1 FPS/'
# Move the muscle patch only files (from the drive directory) into the directory below before running this script
muscle_patches_dir = '/Users/guillaumekugener/Downloads/muscle-patches/'
# Directory with the zip archives of the original frames
true_image_dir = '/Users/guillaumekugener/Downloads/1 FPS Reduced/'

# Root directory of the generated dataset (JPEGImages, Annotations, ImageSets, ...)
final_dataset_directory = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/datasets/fps-1-uncropped/'
# Derived from surgical_git_dir (instead of repeating the absolute path) so the two
# cannot drift apart; the resulting string is identical to the previous literal.
csv_of_total_frames = os.path.join(surgical_git_dir, 'data/total_frames_1_fps.csv')

# Frame rate of the export being processed; a later cell applies a manual fix only when this is 10
frame_rate = 1

# csv of manually corrected annotations, handed to SurgicalVideoAnnotation when parsing the trials
manually_fixed_cases = os.path.join(surgical_git_dir, 'data/manually_fixed_annotations.csv')

### Compile reannotation data

There is a subset of annotations that we want to have reannotated. We want to be able to load the current annotations into VoTT and edit them, rather than having to start from scratch. The function below compiles the folder we need for this task.

In [None]:
# Disabled by default: flip the condition to True to build the reannotation folder.
# compile_reannotation_folder is a project helper from drive_dataset; it is given the csv
# listing the frames to reannotate, the dataset's JPEGImages directory and the directory
# with the complete annotations.
# NOTE(review): the csv path is machine-specific and what the helper writes is not visible here.
if False:
    compile_reannotation_folder(
        reannotation_csv = '/Users/guillaumekugener/Downloads/QC Frames Needing Attention - Sheet1.csv',
        image_directory = os.path.join(final_dataset_directory, 'JPEGImages'),
        complete_annotation_directory = drive_dir
    )

We also need to download the original frames to put into the dataset. Below, we first open all of the zip files to count the total number of frames, in case we need to create empty annotation files for images that have no annotations (because they contain no objects). We copy the images into our dataset later in this notebook.

In [None]:
# Count the .jpeg entries in every trial's image archive and save the counts to
# csv_of_total_frames (columns: trial_id, frames).
images_zips = sorted(i for i in os.listdir(true_image_dir) if re.search('\\.zip$', i))

total_frames = {
    'trial_id': [],
    'frames': []
}

for z in images_zips:
    trial_match = re.search('S[0-9]+T[0-9]+[ab]?', z)
    if trial_match is None:
        # A zip whose name carries no trial id cannot be attributed to a trial;
        # report and skip it instead of crashing on .group(0).
        print('Skipping zip without a trial id: ' + z)
        continue

    # The context manager closes the archive even if listing its members fails.
    # NOTE(review): every member ending in .jpeg is counted, which would include
    # '._*.jpeg' sidecar entries if an archive contains them - confirm.
    with zipfile.ZipFile(os.path.join(true_image_dir, z), 'r') as zf:
        n_jpegs = sum(1 for member in zf.namelist() if re.search('\\.jpeg$', member))

    total_frames['trial_id'].append(trial_match.group(0))
    total_frames['frames'].append(n_jpegs)

pd.DataFrame(total_frames).to_csv(csv_of_total_frames, index=False)

In [None]:
# Load the per-trial frame counts from csv_of_total_frames as a DataFrame.
# Note: this rebinds total_frames, which the previous cell used for a dict of lists.
total_frames = pd.read_csv(csv_of_total_frames)

In [None]:
# Sorted names of every annotation archive ('...annotations.zip') in the drive directory
all_zips = sorted(
    entry for entry in os.listdir(drive_dir)
    if re.search('annotations\\.zip$', entry)
)

In [None]:
# This is to run detection with yolo
# Disabled by default. When enabled it only prints one detect_video.py command line per
# annotation zip (nothing is executed); the trial id is the zip name up to the first '-'.
# NOTE(review): the dataset name, weights file and class count are hard-coded below.
if False:
    for i in all_zips:
        tid = re.sub('\\-.*', '', i)
        ds_name = 'fps-1-uncropped-20200914'
#         print(f"ffmpeg -framerate 1 -i /home/ec2-user/datasets/{ds_name}/JPEGImages/{tid}_frame_%08d.jpeg ./Videos/{tid}.mp4")
        print(f"python detect_video.py --classes /home/ec2-user/datasets/{ds_name}/classes.name --num_classes 8 --weights ./checkpoints/yolov3_train_11.tf --video /home/ec2-user/datasets/{ds_name}/Videos/{tid}.mp4 --output_stats /home/ec2-user/datasets/{ds_name}/yolo/{tid}_stats.pkl --yolo_score_threshold 0.01")

In [None]:
# Accumulators filled by the trial-parsing loop in the next cell.
# Re-run this cell before re-running that loop, otherwise entries are added twice.
frames_to_fix = []  # collects each trial's too_many_tags_frames
all_dataset_objects = []  # collects each trial's frame_objects

In [None]:
# Iterate through all the trials and parse all the annotations we have so far
# all_zips = ['S314T1-annotations.zip']  # uncomment to debug a single trial
redos = []  # zip files that failed to parse and need another attempt
for z in tqdm.tqdm(all_zips):
    trial_id = re.search('S[0-9]+T[0-9]+[ab]?', z).group(0)
    frames_total = total_frames[total_frames['trial_id']==trial_id]['frames'].values[0]

    try:
        ex = SurgicalVideoAnnotation(
            trial_id=trial_id,
            total_frames=frames_total,
            file_path=os.path.join(drive_dir, z),
            output_directory=final_dataset_directory,
            delete_at_end=True,
            annotations_only=True,
            manually_fixed_cases=manually_fixed_cases
        )
    except Exception as err:
        # Catch Exception instead of using a bare except so that KeyboardInterrupt and
        # SystemExit still stop the loop, and report the cause so the failure can be diagnosed.
        print(f"This trial failed: {z} ({err!r})")
        redos.append(z)
        continue

    # extend() appends in place instead of rebuilding the whole list on every trial
    frames_to_fix.extend(ex.too_many_tags_frames)
    all_dataset_objects.extend(ex.frame_objects)

In [None]:
# Only for the 10 FPS export: add a hand-entered suction box for one S306T1 frame,
# then print the objects recorded for that frame and the following one.
if frame_rate == 10:
    all_dataset_objects.append({
        'name': 'S306T1_frame_00002212.jpeg',
        'x1': (492 + 574) // 2,
        'y1': (260 + 370) // 2,
        'x2': (670 + 641) // 2,
        'y2': 720,
        'class': 'suction'
    })
    print([
        obj for obj in all_dataset_objects
        if obj['name'] in ('S306T1_frame_00002212.jpeg', 'S306T1_frame_00002213.jpeg')
    ])

In [None]:
# Annotations that were exported as csv files live in the same drive directory;
# collect their file names in sorted order.
all_csv_annotations = sorted(
    entry for entry in os.listdir(drive_dir)
    if re.search('\\.csv$', entry)
)

Move the images into our dataset. We should only move the trials for which we have annotations above.

In [None]:
# extract_and_move_images is a project helper from drive_dataset. It is given the directory
# with the image zips, the dataset directory and the trial ids to process; the ids are the
# annotation file names with a trailing '.csv' / '.zip' or everything from the first '-' removed.
# NOTE(review): where exactly the images end up is not visible here; later cells read them
# from <final_dataset_directory>/JPEGImages.
extract_and_move_images(
    dir_with_image_zips=true_image_dir,
    output_directory=final_dataset_directory,
    trials_to_process=[re.sub('(\\.csv$)|(\\-.*)|(\\.zip$)', '', i) for i in all_zips + all_csv_annotations]
)

In [None]:
# Create file with image sizes for all of the trials
# The size is read from the first frame of each annotated trial.
image_sizes_all = {'trial_id': [], 'w': [], 'h': []}
for tid in [re.sub('(\\.csv$)|(\\-.*)|(\\.zip$)', '', i) for i in all_zips + all_csv_annotations]:
    first_frame_path = os.path.join(final_dataset_directory, 'JPEGImages', f"{tid}_frame_00000001.jpeg")
    first_frame = cv2.imread(first_frame_path)
    if first_frame is None:
        # cv2.imread signals a missing/unreadable file by returning None, not by raising;
        # fail with a clear message instead of an AttributeError on .shape
        raise FileNotFoundError(f"Could not read the first frame of trial {tid}: {first_frame_path}")
    img_size = first_frame.shape  # (height, width, channels)

    # A trailing lowercase suffix (e.g. the 'b' in S810T1b) is dropped from the recorded trial id
    image_sizes_all['trial_id'].append(re.sub('[a-z]$', '', tid))
    image_sizes_all['w'].append(img_size[1])
    image_sizes_all['h'].append(img_size[0])

In [None]:
# One row per distinct (trial_id, w, h) combination
trial_frame_sizes = pd.DataFrame(image_sizes_all).drop_duplicates().reset_index(drop=True)
# os.path.join (as used everywhere else in this notebook) avoids the doubled '/' the
# f-string produced; the same file is written. The row index is written as the first column.
trial_frame_sizes.to_csv(os.path.join(final_dataset_directory, 'image_sizes.csv'))

In [None]:
# Loop through the csv annotation exports and turn them into dataset rows
# (dataset_formatted) and one Pascal VOC xml file per frame.
dataset_formatted = []
for csv_name in tqdm.tqdm(all_csv_annotations):
    # Derive the trial id with the same pattern used when extracting the images, so an
    # export named without a '-' still yields a clean trial id
    current_trial_id = re.sub('(\\.csv$)|(\\-.*)|(\\.zip$)', '', csv_name)
    example_export = pd.read_csv(os.path.join(drive_dir, csv_name))

    # Some meta data we need
    # Prefer an exact trial id match: a substring match on e.g. 'S810T1' would also hit
    # 'S810T1b' and silently pick whichever row comes first. Fall back to the substring
    # match only when there is no exact match.
    trial_rows = total_frames[total_frames['trial_id'] == current_trial_id]
    if trial_rows.empty:
        trial_rows = total_frames[total_frames['trial_id'].str.contains(current_trial_id)]
    n_frames = trial_rows['frames'].iloc[0]
    all_frames = [current_trial_id + '_frame_' + str(i).zfill(8) + '.jpeg' for i in range(1, 1+n_frames)]

    # The image size of the trial is read from the first annotated image
    reference_image_path = os.path.join(final_dataset_directory, 'JPEGImages', example_export.at[0, 'image'])
    reference_image = cv2.imread(reference_image_path)
    if reference_image is None:
        # cv2.imread returns None for a missing/unreadable file instead of raising
        raise FileNotFoundError(f"Could not read reference image for {csv_name}: {reference_image_path}")
    img_size = reference_image.shape  # (height, width, channels)

    frames = {}
    for row in example_export.itertuples(index=False):
        image = row.image
        label = row.label
        # Round the box outwards to whole pixels and keep it inside the image:
        # the lower bounds are clamped to 0 the same way the upper bounds are
        # clamped to the image width/height.
        xmin = max(0, floor(row.xmin))
        ymin = max(0, floor(row.ymin))
        xmax = min(img_size[1], ceil(row.xmax))
        ymax = min(img_size[0], ceil(row.ymax))

        # Run some checks
        if label == 'undefined':
            print('Undefined label: ' + image)

        if len(label.split(',')) > 1:
            print('Multi-label: ' + image)

        if image not in frames:
            frames[image] = { 'name': image, 'height': img_size[0], 'width': img_size[1], 'tools': [] }

        frames[image]['tools'].append({'coordinates': [(xmin, ymin), (xmax, ymax)], 'type': label})

        dataset_formatted.append({
            'name': image,
            'x1': xmin,
            'y1': ymin,
            'x2': xmax,
            'y2': ymax,
            'class': label
        })

    # Have to fill in blank: frames without any annotation get an empty frame object
    # and a row with empty coordinates/class
    for f in all_frames:
        if f in frames:
            continue
        frames[f] = { 'name': f, 'height': img_size[0], 'width': img_size[1], 'tools': [] }
        dataset_formatted.append({'name': f, 'x1': '', 'y1': '', 'x2': '', 'y2': '', 'class': ''})

    # Now make the frame objects
    frame_objects = list(frames.values())

    # And make their xmls
    for fo in frame_objects:
        convert_frame_object_to_xml(
            frame_obj=fo,
            destination=os.path.join(final_dataset_directory, 'Annotations')
        )

In [None]:
# Combine the objects parsed from the zip exports with those parsed from the csv exports
# into a single table; the cell displays its shape.
all_objects_ds_df = pd.DataFrame(all_dataset_objects + dataset_formatted)
all_objects_ds_df.shape

In [None]:
# Sanity check: shape of the rows whose frame name contains 'S810T1b'
all_objects_ds_df[all_objects_ds_df['name'].str.contains('S810T1b')].shape

Create the class map for the dataset below

In [None]:
# Distinct class labels in order of first appearance, without the empty label
# that marks frames with no objects
distinct_classes = all_objects_ds_df['class'].unique()
class_map = pd.DataFrame({'class': distinct_classes})
class_map = class_map[class_map['class'].ne('')]

In [None]:
# classes.name: one class label per line, no header
classes_name_path = os.path.join(final_dataset_directory, 'classes.name')
class_map.to_csv(classes_name_path, sep='\t', header=False, index=False)

# retina_classes.csv: "label,index" rows where index is the row position in class_map
class_map_retina = class_map.copy()
class_map_retina['index'] = list(range(class_map_retina.shape[0]))
retina_classes_path = os.path.join(final_dataset_directory, 'retina_classes.csv')
class_map_retina.to_csv(retina_classes_path, header=False, index=False)

We look for undefined objects. We then manually inspect and fix these and save the results to a CSV. That CSV is used in subsequent runs to fix the labels, so that we do not have to repeat this manual process.

In [None]:
# Objects still labelled 'undefined' are the ones we have to fix; when there are any,
# report how many and dump them to a dated csv in the repo's data folder.
undefined_objects = all_objects_ds_df[all_objects_ds_df['class']=='undefined']
n_undefined = undefined_objects.shape[0]
if n_undefined > 0:
    print(f"There are {n_undefined} undefined objects")
    undefined_objects.to_csv(
        os.path.join(surgical_git_dir, 'data', todays_date + '_fixed_annotations.csv'),
        index=False
    )

In [None]:
# all_objects_ds_df[all_objects_ds_df['name']=='S109T1_frame_00000087.jpeg']

In [None]:
# Fix my naming mistakes
all_objects_ds_df = fix_S810T1b(
    image_dir = os.path.join(final_dataset_directory, 'JPEGImages'),
    annotations_dir = os.path.join(final_dataset_directory, 'Annotations'),
    complete_set_df = all_objects_ds_df.copy()
)

In [None]:
# Table shape after the S810T1b fix
all_objects_ds_df.shape

In [None]:
# Muscle patch for a whole set of images were missing so adding them here
all_objects_ds_df = add_missing_muscle(
    muscle_annotations_path=muscle_patches_dir,
    image_dir = os.path.join(final_dataset_directory, 'JPEGImages'),
    annotations_dir= os.path.join(final_dataset_directory, 'Annotations'),
    complete_set_df = all_objects_ds_df.copy()
)

In [None]:
# Table shape after adding the missing muscle patches
all_objects_ds_df.shape

In [None]:
# Replace the poorly annotated frames with the QC ones
all_objects_ds_df = replace_qc_frames(
    reannotation_csv = '/Users/guillaumekugener/Downloads/QC Frames Needing Attention - Sheet1.csv',
    qc_directory='/Users/guillaumekugener/Downloads/QC Annotations/QC-PascalVOC-export/Annotations',
    final_dataset_directory=final_dataset_directory,
    complete_set_df = all_objects_ds_df.copy()
)

In [None]:
# Table shape after swapping in the QC annotations
all_objects_ds_df.shape

In [None]:
# Remove rows that are exact duplicates across all columns
all_objects_ds_df = all_objects_ds_df.drop_duplicates()

In [None]:
# Not sure how these get added...
# Names starting with '._' (e.g. '._S310T1_frame_00000003.jpeg') are not real frames: every
# real frame is named '<trial>_frame_<number>.jpeg'. Drop any such entry rather than only the
# two that were spotted by hand.
# NOTE(review): presumably these are macOS '._' sidecar files picked up from the archives - confirm.
all_objects_ds_df = all_objects_ds_df[~all_objects_ds_df['name'].str.startswith('._', na=False)]

In [None]:
# Renumber the rows 0..n-1 after the filtering above; the old index is discarded
all_objects_ds_df = all_objects_ds_df.reset_index(drop=True)

In [None]:
# Table shape after de-duplication and clean-up
all_objects_ds_df.shape

In [None]:
# Check that we have all of the frames and annotations in the dataset (we should not be missing anything)
# Frames are attributed to a trial by the '<trial_id>_' prefix of their name. A substring
# match would also count e.g. S810T1b frames for trial S810T1.
frame_names = all_objects_ds_df['name']
for tid, total_expected in zip(total_frames['trial_id'], total_frames['frames']):
    total_actual = frame_names[frame_names.str.startswith(tid + '_', na=False)].unique()

    if len(total_actual) != total_expected:
        print(f"{tid}. Expected: {total_expected}, actual: {len(total_actual)}")

In [None]:
## We can catch errors in the annotations below. We need to manually fix these
# Each flagged frame is printed by name and drawn with all of its bounding boxes.
for flagged in frames_to_fix:
    flagged_name = flagged['name']
    print(flagged_name)
    plot_frame_with_bb(
        image_path=os.path.join(final_dataset_directory, 'JPEGImages', flagged_name),
        annotation_path=os.path.join(
            final_dataset_directory, 'Annotations', re.sub('\\.jpeg$', '.xml', flagged_name)
        ),
        only_undefined=False
    )


In [None]:
# Total number of frames across the trials that have annotations (zip or csv export).
# Trial ids are derived with the same suffix-stripping pattern used when extracting the images,
# so a csv export named without a '-' is matched as well.
annotated_trial_ids = [re.sub('(\\.csv$)|(\\-.*)|(\\.zip$)', '', i) for i in all_zips + all_csv_annotations]
frames_in_current_ds = sum(total_frames[total_frames['trial_id'].isin(annotated_trial_ids)]['frames'])

In [None]:
# Report the total number of frames belonging to the annotated trials
print(f"Total frames in current version of ds: {frames_in_current_ds}")

In [None]:
# Distinct frame file names present in the object table, sorted in place
all_frames_dataset = all_objects_ds_df['name'].unique()
all_frames_dataset.sort()

## QC Annotations

Below, we save all of the frames with the annotations included so that we can visually inspect them.

In [None]:
import progressbar

In [None]:
# If we want to QC our data (which we will want to do later, per recommendations made about clean data)
# Disabled by default. Renders every frame with its bounding boxes into
# <final_dataset_directory>/AnnotationValidation/<trial>/<frame>.
if False:
    validation_root = os.path.join(final_dataset_directory, 'AnnotationValidation')
    # Iterate over the array directly (the enumerate index was unused) so the
    # progress bar can see how many frames there are
    for f in progressbar.progressbar(all_frames_dataset):
        trial_dir = os.path.join(validation_root, re.sub('_.*', '', f))
        save_path = os.path.join(trial_dir, f)

        # In case this has to stop for some reason: frames already rendered are skipped
        if os.path.isfile(save_path):
            continue

        # Create the trial folder (and its parent) if needed. Unlike a bare
        # `except: pass` around os.mkdir, real failures such as permission errors surface.
        os.makedirs(trial_dir, exist_ok=True)

        plot_frame_with_bb(
            image_path=os.path.join(final_dataset_directory, 'JPEGImages', f),
            annotation_path=os.path.join(final_dataset_directory, 'Annotations', re.sub('\\.jpeg$', '.xml', f)),
            only_undefined=False,
            save_path=save_path
        )

## Define training, validation, test sets

Below, we create the training and validation csvs. For testing, we will use additional videos not in our original 46 (as this will have the least bias)

In [None]:
# These were randomly selected
# Trial ids reserved for the test split, as provided by the drive_dataset helper
test_trials = get_trial_test_set()

In [None]:
# These were randomly selected
# Trial ids reserved for the validation split, as provided by the drive_dataset helper
validation_trials = get_trial_validation_set()

In [None]:
# Assign every frame (name without the .jpeg extension) to a split according to its trial,
# i.e. the part of the file name before the first '_'. A frame goes to 'train' only when
# its trial is in neither the validation nor the test list.
frames_relevant_o = {'train': [], 'val': [], 'test': []}
for frame_file in all_frames_dataset:
    frame_trial = re.sub('_.*', '', frame_file)
    frame_stem = re.sub('\\.jpeg$', '', frame_file)
    in_val = frame_trial in validation_trials
    in_test = frame_trial in test_trials
    if not in_val and not in_test:
        frames_relevant_o['train'].append(frame_stem)
    if in_val:
        frames_relevant_o['val'].append(frame_stem)
    if in_test:
        frames_relevant_o['test'].append(frame_stem)

# Distinct trial ids contributing to each split
trials_relevant = {
    k: list(set([re.sub('_frame.*', '', j) for j in split_frames]))
    for k, split_frames in frames_relevant_o.items()
}

In [None]:
# Write one ImageSets/Main file per split: tab-separated "<frame name>\t1" lines, no header
for g in ('train', 'val', 'test'):
    image_set_path = os.path.join(final_dataset_directory, 'ImageSets/Main', 'surgical_1fps_' + g + '.txt')
    pascal_training_csv = pd.DataFrame({ 'name': frames_relevant_o[g], 'inc': 1})
    print(f"Dataset {g} size: {len(pascal_training_csv)} frames")
    pascal_training_csv.to_csv(image_set_path, sep = '\t', header=False, index=False)

    # Optional small subset (first 100 frames) for quick experiments:
#     pascal_training_csv.head(100).to_csv(
#         os.path.join(final_dataset_directory, 'ImageSets/Main', 'small_surgical_1fps_' + g + '.txt'),
#         sep = '\t', header=False, index=False
#     )
    


In [None]:
# We have to convert the class label to an index first.
# Read in the class.name file and use that order
# classes.name is written tab-separated with one label per line, so read it back with the
# same separator: with the default ',' a label that contains a comma (the multi-label case
# flagged while parsing the csv exports) would be split across columns or fail to parse.
class_map = pd.read_csv(os.path.join(final_dataset_directory, 'classes.name'), sep='\t', header=None)
# label -> zero-based position in the file
class_dict_mapping = {label: position for position, label in enumerate(class_map[0])}


Below is for the retinanet data

In [None]:
# Display which trials ended up in each split
trials_relevant

In [None]:
# Image directory prefixes handed to create_retinanet_csv, one per environment
aws_dir_prefix = '/home/ec2-user/datasets/fps-1-uncropped-20210205/JPEGImages/'
local_dir_predix = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/datasets/fps-1-uncropped/JPEGImages/'

# Same table and split grouping for both environments; only the csv base name and
# the directory prefix differ.
for retinanet_csv_name, retinanet_dir_prefix in (
    ('retinanet_surgical_1fps', aws_dir_prefix),          # For AWS files
    ('local_retinanet_surgical_1fps', local_dir_predix),  # For local files
):
    create_retinanet_csv(
        all_objects_ds_df=all_objects_ds_df,
        dir_prefix=retinanet_dir_prefix,
        final_dataset_directory=final_dataset_directory,
        csv_name=retinanet_csv_name,
        grouping=trials_relevant
    )

# validation_indices = all_objects_ds_df['name'].str.contains('|'.join(validation_trials))
# retinanet_training_csv = all_objects_ds_df[~validation_indices].copy()
# retinanet_validation_csv = all_objects_ds_df[validation_indices].copy()

# retinanet_training_csv['name'] = dir_prefix + retinanet_training_csv['name']
# retinanet_validation_csv['name'] = dir_prefix + retinanet_validation_csv['name']

# # We need to set the full path    
# retinanet_training_csv.to_csv(
#     os.path.join(final_dataset_directory, 'ImageSets/Main', 'retinanet_surgical_1fps_train.csv'),
#     sep=',', header=False, index=False
# )
# retinanet_validation_csv.to_csv(
#     os.path.join(final_dataset_directory, 'ImageSets/Main', 'retinanet_surgical_1fps_validation.csv'),
#     sep=',', header=False, index=False
# )

## Stats

This section gives an overview of the dataset (number of frames in training and validation, number of tools, number of trials, etc.).

In [None]:
# Summarise the retinanet csvs of each split: number of annotation rows per class, that
# number as a percentage of all rows in the split, and the number of distinct frames.
retinanet_columns = ['file', 'x1', 'y1', 'x2', 'y2', 'class']
retinanet_split_files = {
    'Training': 'retinanet_surgical_1fps_train.csv',
    'Validation': 'retinanet_surgical_1fps_val.csv',
    'Testing': 'retinanet_surgical_1fps_test.csv',
}
data_on_ds = {
    g: pd.read_csv(
        os.path.join(final_dataset_directory, 'ImageSets/Main', split_file),
        names=retinanet_columns
    )
    for g, split_file in retinanet_split_files.items()
}

output_string = ""
for g, split_df in data_on_ds.items():
    # Non-null x1 values per class.
    # NOTE(review): pandas reads empty csv fields as NaN and groupby drops NaN keys by
    # default, so rows without a class probably never reach the '' check below - confirm.
    stat_df = split_df[['x1','class']].groupby('class').agg(['count'])
    output_string += f"--- {g} info ---\n\n"
    for tool, counts in zip(stat_df.index.values, stat_df.values):
        total = counts[0]
        if tool == '':
            tool = 'None'
        output_string += f"\t{tool}: {total} ({round(total/split_df.shape[0]*100, 1)}%)\n"
    output_string += f"\nTotal frames: {len(set(split_df['file']))}\n\n"

# The context manager closes the file even if the write fails
with open(os.path.join(final_dataset_directory, 'ImageSets/Main', 'stats_' + todays_date + '.txt'), "w") as text_file:
    text_file.write(output_string)