# Yolo post processing annotation clean-up

The purpose of this notebook is to develop the model that takes the bounding box predictions output by YOLO, concatenates them into sequences of consecutive frames, and then runs these sequences through an RNN.

## Module imports

In [None]:
import pickle
import os
import numpy as np
import pandas as pd
import copy
from collections import deque
import random
import time

from math import floor

from lxml import etree
import re
import tqdm

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import cv2

In [None]:
%load_ext autoreload
%autoreload 2

We include the chunk below in order to import the classes we developed, which include the data generator and the RNN model definition.

In [None]:
# Absolute path to the local checkout of the project; it is machine-specific and must be
# adjusted when the notebook is run elsewhere.
path_to_repo = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/surgical-training-project/'

import sys
# NOTE(review): importlib is not referenced in the visible cells of this notebook.
import importlib  
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, os.path.join(path_to_repo, 'tools'))

# Project-local helpers; these resolve only because <repo>/tools was added to sys.path above.
from utils import bb_intersection_over_union, compare_detected_to_annotation, plot_single_frame_with_outlines, gt_as_pd_df
from post_process_yolo_generator import PostProcessYoloGenerator

## Directories and data

The chunk below holds the information as to where our main dataset, images, and annotations directories are. We also load the classes file which maps the object classes to integers

In [None]:
# Root folder containing the datasets (absolute, machine-specific path; adjust when running elsewhere)
DATASET_BASE_DIR = '/Users/guillaumekugener/Documents/USC/USC_docs/ml/datasets/'

# Specific dataset
DATASET_PATH = os.path.join(DATASET_BASE_DIR, 'large-clean-surgical-ds')
# Frame images; read back later when drawing the boxes on each frame
IMAGES_DIR = os.path.join(DATASET_PATH, 'JPEGImages')
# Passed to the generators as `annotation_dir`
ANNOTATION_DIR = os.path.join(DATASET_PATH, 'Annotations')

# Where we will save our output files and stats
# (also passed to the generators as `video_data_dir`)
OUTPUTS_FROM_DETECTION_DIR = os.path.join(DATASET_PATH, 'detection')

# Class list; read below as a tab-separated file without a header row
classes_file = os.path.join(DATASET_PATH, 'classes.name')

In [None]:
# gt_as_df = gt_as_pd_df(os.path.join(ANNOTATION_DIR), None)
# gt_as_df.to_csv(
#     os.path.join(DATASET_BASE_DIR, 'cvat_output', 'gt_labels.csv'),
#     sep=',', index=False)

In [None]:
# Load the class list (tab-separated, no header; class names are in the first column)
# and derive both lookup directions from the row order.
classes_map = pd.read_csv(classes_file, sep='\t',header=None)
class_names = classes_map[0]

# class name -> integer index (row position in the classes file)
class_to_index = {name: int(position) for position, name in enumerate(class_names)}
# integer index -> class name
reverse_index_to_class = list(class_names)

In [None]:
# Number of rows in the classes file, i.e. the number of object classes
classes_map.shape[0]

In the chunk below, we keep a dictionary that defines the start and end frames of each of the videos that we are going to process.

TODO: do this programmatically (based on the frames in our dataset)

In [None]:
# First frame number of each video (used as the inclusive lower bound in make_frame_names)
dict_video_data_start_id = {
    'S306T1': 1730,
    'S306T2': 15140,
    'S611T1': 23990, # This is the one used for validation
    'S609T2': 20330,
#     'S303T1': 1640
}

# End frame number of each video (used as the exclusive upper bound of range() in make_frame_names,
# so the end frame itself is not generated)
dict_video_data_end_id = {
    'S306T1': 11100,
    'S306T2': 21060,
    'S611T1': 28800, # This is the one used for validation
    'S609T2': 22890,
#     'S303T1': 11100
}

In [None]:
# Video-level split: frames of these videos are expanded into the training / validation frame lists
training_videos = ['S306T1', 'S306T2', 'S609T2']
validation_videos = ['S611T1']

def make_frame_names(video_ids, tool_ids=None, start_ids=None, end_ids=None):
    """Build the '<video>_frame_<frame>_tool_<tool>' names for the given videos.

    Names are produced video by video, then tool by tool, then frame by frame.
    The frame number is zero-padded to 8 digits and covers
    ``range(start_ids[video], end_ids[video])``, i.e. the end frame is excluded.

    Parameters
    ----------
    video_ids : iterable of str
        Videos to generate names for; each must be a key of ``start_ids`` and ``end_ids``.
    tool_ids : iterable, optional
        Tool (class) indices. Defaults to ``classes_map.index.values``.
    start_ids : mapping, optional
        Video id -> first frame number. Defaults to ``dict_video_data_start_id``.
    end_ids : mapping, optional
        Video id -> exclusive end frame number. Defaults to ``dict_video_data_end_id``.

    Returns
    -------
    list of str
        One name per (video, tool, frame) combination.

    Raises
    ------
    KeyError
        If a video id is missing from ``start_ids`` or ``end_ids``.
    """
    # The defaults are resolved at call time, so the notebook-level tables are still picked up
    # (including later redefinitions) when the caller does not pass anything explicitly.
    if tool_ids is None:
        tool_ids = classes_map.index.values
    if start_ids is None:
        start_ids = dict_video_data_start_id
    if end_ids is None:
        end_ids = dict_video_data_end_id

    all_frames = []
    for video_id in video_ids:
        frame_numbers = range(start_ids[video_id], end_ids[video_id])
        for tool in tool_ids:
            all_frames.extend(
                f"{video_id}_frame_{str(frame_number).zfill(8)}_tool_{tool}"
                for frame_number in frame_numbers
            )
    return all_frames

# Expand each split into one name per (video, tool, frame) combination
training_frames = make_frame_names(training_videos)
validation_frames = make_frame_names(validation_videos)


In [None]:
# NOTE(review): this looks like a leftover scratch cell; `a` is not used by any later visible cell.
# for i in class_to_index:
#     print(class_to_index[i])

# A list with one zero per class
a = [0]*len(classes_map)
a

In [None]:
# Size of each split; these count (video, tool, frame) names, not distinct frames
print(f"Number of training frames: {len(training_frames)}")
print(f"Number of validation frameS: {len(validation_frames)}")

## Preparation for training

In the chunk below, we use our custom data generator function to generate our training and validation data. We also specify the training parameters (batch size, sequence length, number of objects we are predicting, shuffling, etc)

TODO: the generator is quite slow in its current implementation (the focus was getting it to work in the first place). Would potentially be worthwhile to revisit its implementation to see if certain parts could be sped up. For example, I make use of for loops all over the place. There may be ways to take advantage of vectorized operations via numpy that could dramatically reduce compute time in data generation.

In [None]:
# Generator settings used for both training and validation
params = dict(
    batch_size=64,
    seq_len=12,
    grid_size=32,
    n_objects=classes_map.shape[0],
    out_n_objects=5,
    shuffle=True,
)

# The two generators are configured identically and differ only in the frames they are given
shared_generator_kwargs = dict(
    video_ids=dict_video_data_start_id,
    video_data_dir=OUTPUTS_FROM_DETECTION_DIR,
    annotation_dir=ANNOTATION_DIR,
    classes_map=class_to_index,
    **params,
)

training_generator = PostProcessYoloGenerator(
    frame_ids=training_frames,
    **shared_generator_kwargs)

validation_generator = PostProcessYoloGenerator(
    frame_ids=validation_frames,
    **shared_generator_kwargs)

In [None]:
# Fetch item 0 from the training generator as an (inputs, targets) pair for inspection
Xe, ye = training_generator[0]

## Model definition

The chunk below loads our model class and sets up the final components of our model architecture

In [None]:
import tensorflow as tf
from sequential_model import SequentialPostProcess, sequential_model_loss

This loss actually needs to be two-part (since one part is classification and the other is regression).

In [None]:
# Try the custom loss on a small hand-written example (2 rows, 10 values per row).
# Presumably each row is 2 objects x 5 values (score, x1, y1, x2, y2) — TODO confirm against sequential_model_loss
example_loss = sequential_model_loss(2)
# Values passed as the second (prediction) argument of the loss
obj_example = np.array([
    [0.9, 0.1, 0.1, 0.2, 0.2, 0.1, 0.2, 0.3, 0.4, 0.5],
    [0.3, 0.2, 0.2, 0.4, 0.4, 0.8, 0.15, 0.15, 0.25, 0.25]
])
# Values passed as the first (target) argument of the loss
true_example = np.array([
    [1, 0.12, 0.12, 0.22, 0.22, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
])

example_loss(true_example, obj_example)

In [None]:
# Build the sequence model; input and output sizes are taken from the training generator
model = SequentialPostProcess.build(
    num_seq =  training_generator.dim[0], 
    num_features = training_generator.dim[1], 
    # 5 values per output object; presumably (score, x1, y1, x2, y2) — TODO confirm
    output_shape = training_generator.out_dim * 5)


# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` may need to be replaced by a learning-rate schedule on newer
# TensorFlow/Keras releases — confirm against the installed version.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001, decay=1e-6)
model.compile(loss=sequential_model_loss(params['out_n_objects']), optimizer=opt)
# model.compile(loss='mean_squared_error', optimizer=opt)

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

### Here is where we actually perform our model training

We might want to rewrite this document as a training script. I am worried that if we move this to the cloud, we may lose connection and then training would fail. Having a script, we would be able to start a screen session and then not have to worry about dropping the ssh connection

In [None]:
# Train for 5 epochs, drawing batches from the two generators with 6 multiprocessing workers.
# The object returned by fit() is kept in `history`.
history = model.fit(
    x=training_generator,
    validation_data=validation_generator,
    use_multiprocessing=True,
    workers=6,
    epochs=5
)

In [None]:
# Save the full trained model under <repo>/checkpoints/models
weights_dir = os.path.join(path_to_repo, 'checkpoints/models')
# NOTE(review): 'latest_lsm' looks like a typo for 'latest_lstm' (the name used in the
# commented-out line below); left unchanged because other code may load this exact path.
model.save(os.path.join(weights_dir, 'latest_lsm')) 
# model.save_weights(os.path.join(weights_dir, 'latest_lstm'))

## Evaluation

In the sections below, we perform our model evaluation. We run the prediction on our training data and save the outputs to a pandas dataframe, along with the yolo outputs and ground truth outputs. 

In [None]:
# This is not shuffled so the frames are in order
# NOTE(review): 'n_objects' is 2 here, whereas the training params use classes_map.shape[0];
# confirm that the generator produces inputs of the size the trained model expects.
prediction_params = {
    'batch_size': 64, 
    'seq_len': 12, 
    'grid_size': 32, 
    'n_objects': 2, 
    'out_n_objects': 5, 
    'shuffle': False
}


# Predict over both splits: training frames first, then validation frames
frames_predicting_one = training_frames + validation_frames

prediction_training_generator = PostProcessYoloGenerator(
    video_ids=dict_video_data_start_id, 
    video_data_dir=OUTPUTS_FROM_DETECTION_DIR,
    annotation_dir=ANNOTATION_DIR,
    frame_ids=frames_predicting_one,
    classes_map=class_to_index,
    **prediction_params)


In [None]:
# Run the model over the unshuffled prediction generator and report the wall-clock time
prediction_started_at = time.time()
training_predictions = model.predict(
    prediction_training_generator,
    workers=6,
    use_multiprocessing=True,
)
elapsed_seconds = time.time() - prediction_started_at
print(f"Time for prediciton: {elapsed_seconds}")

In [None]:
# Add the unlabelled video to the frame-range tables defined earlier in the notebook.
# New dict objects are created, so objects that hold the earlier tables are unaffected.
dict_video_data_start_id = {**dict_video_data_start_id, 'S303T1': 1640}
dict_video_data_end_id = {**dict_video_data_end_id, 'S303T1': 11100}


# Names for every (tool, frame) combination of the unlabelled video
unlabelled_frames = make_frame_names(['S303T1'])

In [None]:
from post_process_yolo_generator import PostProcessYoloDetectionGenerator
import copy

In [None]:
# Reuse the prediction settings without the 'shuffle' entry, and with one item per batch
unlabelled_params = {
    key: value for key, value in prediction_params.items() if key != 'shuffle'
}
unlabelled_params['batch_size'] = 1

# Get prediction for the unlabelled
unlabelled_generator = PostProcessYoloDetectionGenerator(
    frame_ids=unlabelled_frames,
    classes_map=class_to_index,
    annotation_dir=ANNOTATION_DIR,
    video_data_dir=OUTPUTS_FROM_DETECTION_DIR,
    video_ids={ 'S303T1': 1640 }, 
    **unlabelled_params)

In [None]:
# Number of (tool, frame) names generated for the unlabelled video
len(unlabelled_frames)

In [None]:
# Fetch item 0 from the unlabelled generator for inspection (this overwrites the earlier `Xe`)
Xe = unlabelled_generator[0]

In [None]:
# Run the model over the unlabelled generator and report the wall-clock time.
# Multiprocessing is left disabled here (previously tried: use_multiprocessing=True, workers=1).
unlabelled_prediction_started_at = time.time()
unlabelled_predictions = model.predict(unlabelled_generator)
unlabelled_elapsed_seconds = time.time() - unlabelled_prediction_started_at
print(f"Time for prediciton: {unlabelled_elapsed_seconds}")

In [None]:
# Dimensions of the prediction array for the unlabelled video
unlabelled_predictions.shape

In the chunks below, we create data frames of the detected objects for the YOLO and LSTM sources (for both the labelled and the unlabelled videos); the ground-truth (gt) data frame is not constructed in these chunks. We then combine these into a single data frame per video set, which we use for our final analysis.

In [None]:
# Now save the outputs (video_id, tool, (score, bounding box)*n)
# Long format: one row per retained box with columns
# (source, video_id, frame_id, tool_id, score, x1, y1, x2, y2).
pred_score_threshold = 0.05 # Ignore objects below this score to keep the output small
bb_col_names = ['score', 'x1', 'y1', 'x2', 'y2']

# Create every column up front so the data frame always has the expected columns,
# even when there are no predictions at all.
prediction_output_data = {
    col: [] for col in ['source', 'video_id', 'frame_id', 'tool_id'] + bb_col_names
}

# Each prediction row is a flat run of len(bb_col_names)-wide groups, one per predicted box.
# The box count is taken from the array width rather than from prediction_params['n_objects']
# (2), which stops after the first two boxes although the model is built with out_dim * 5 outputs.
# NOTE(review): confirm that every box in a row is meant to be exported.
values_per_box = len(bb_col_names)
boxes_per_row = training_predictions.shape[1] // values_per_box

for i, row in enumerate(training_predictions):
    # Names have the form '<video>_frame_<8-digit frame>_tool_<class index>'
    frame_name = frames_predicting_one[i]
    video_id = re.sub('_.*', '', frame_name)
    frame_id = re.sub('(.*_frame_)|(_tool.*)', '', frame_name)
    tool_id = reverse_index_to_class[int(re.sub('.*_tool_', '', frame_name))]

    # Now go through each bounding box and add it
    for bbi in range(boxes_per_row):
        bb = row[bbi*values_per_box:(bbi+1)*values_per_box]

        if bb[0] < pred_score_threshold:
            continue # Ignore this detected object

        prediction_output_data['source'].append('lstm')
        prediction_output_data['video_id'].append(video_id)
        prediction_output_data['frame_id'].append(frame_id)
        prediction_output_data['tool_id'].append(tool_id)

        for col, value in zip(bb_col_names, bb):
            prediction_output_data[col].append(value)

predicted_output_as_df = pd.DataFrame(prediction_output_data)

In [None]:
# Now save the outputs (video_id, tool, (score, bounding box)*n)
# Same long format as for the labelled videos: one row per retained box.
# Create every column up front so the data frame always has the expected columns,
# even when there are no predictions at all.
unlabelled_prediction_output_data = {
    col: [] for col in ['source', 'video_id', 'frame_id', 'tool_id'] + bb_col_names
}

# Each prediction row is a flat run of len(bb_col_names)-wide groups, one per predicted box.
# The box count is taken from the array width rather than from prediction_params['n_objects']
# (2), which stops after the first two boxes although the model is built with out_dim * 5 outputs.
# NOTE(review): confirm that every box in a row is meant to be exported.
unlabelled_values_per_box = len(bb_col_names)
unlabelled_boxes_per_row = unlabelled_predictions.shape[1] // unlabelled_values_per_box

for i, row in enumerate(unlabelled_predictions):
    # Names have the form '<video>_frame_<8-digit frame>_tool_<class index>'
    frame_name = unlabelled_frames[i]
    video_id = re.sub('_.*', '', frame_name)
    frame_id = re.sub('(.*_frame_)|(_tool.*)', '', frame_name)
    tool_id = reverse_index_to_class[int(re.sub('.*_tool_', '', frame_name))]

    # Now go through each bounding box and add it
    for bbi in range(unlabelled_boxes_per_row):
        bb = row[bbi*unlabelled_values_per_box:(bbi+1)*unlabelled_values_per_box]

        if bb[0] < pred_score_threshold:
            continue # Ignore this detected object

        unlabelled_prediction_output_data['source'].append('lstm')
        unlabelled_prediction_output_data['video_id'].append(video_id)
        unlabelled_prediction_output_data['frame_id'].append(frame_id)
        unlabelled_prediction_output_data['tool_id'].append(tool_id)

        for col, value in zip(bb_col_names, bb):
            unlabelled_prediction_output_data[col].append(value)

unlabelled_prediction_output_data_as_df = pd.DataFrame(unlabelled_prediction_output_data)

In [None]:
# Now save the outputs (video_id, tool, (score, bounding box)*n)
# Collect the YOLO detections held by the prediction generator, one row per retained box.
yolo_output_data = {}
tools_of_interest = classes_map.index.values

for video_id in prediction_training_generator.data:
    for row_idx, detections in enumerate(prediction_training_generator.data[video_id]):
        # Frame number = position in this video's detection list + the video's offset
        frame_id = str(row_idx + prediction_training_generator.video_offsets[video_id]).zfill(8)

        # Create the output columns the first time any frame is visited
        if 'video_id' not in yolo_output_data:
            yolo_output_data.update(
                {col: [] for col in ['source', 'video_id', 'frame_id', 'tool_id'] + bb_col_names}
            )

        # detections[3][0] bounds the loop; [0], [1] and [2] are indexed per detection
        # for the box coordinates, the score and the class index respectively
        for det_idx in range(detections[3][0]):
            score = detections[1][0][det_idx]
            if score < pred_score_threshold:
                continue

            class_index = int(detections[2][0][det_idx])
            if class_index not in tools_of_interest:
                continue

            yolo_output_data['source'].append('yolo')
            yolo_output_data['video_id'].append(video_id)
            yolo_output_data['frame_id'].append(frame_id)
            yolo_output_data['tool_id'].append(reverse_index_to_class[class_index])
            yolo_output_data['score'].append(score)

            # Remaining columns (x1, y1, x2, y2) are filled positionally from the box
            box = detections[0][0][det_idx]
            for coord_idx, col in enumerate(bb_col_names[1:]):
                yolo_output_data[col].append(box[coord_idx])

yolo_output_data_as_df = pd.DataFrame(yolo_output_data)

In [None]:
# Now save the outputs (video_id, tool, (score, bounding box)*n)
# Collect the YOLO detections held by the unlabelled generator, one row per retained box.
unlabelled_yolo_output_data = {}

for video_id in unlabelled_generator.data:
    for row_idx, detections in enumerate(unlabelled_generator.data[video_id]):
        # Frame number = position in this video's detection list + the video's offset
        frame_id = str(row_idx + unlabelled_generator.video_offsets[video_id]).zfill(8)

        # Create the output columns the first time any frame is visited
        if 'video_id' not in unlabelled_yolo_output_data:
            unlabelled_yolo_output_data.update(
                {col: [] for col in ['source', 'video_id', 'frame_id', 'tool_id'] + bb_col_names}
            )

        # detections[3][0] bounds the loop; [0], [1] and [2] are indexed per detection
        # for the box coordinates, the score and the class index respectively
        for det_idx in range(detections[3][0]):
            score = detections[1][0][det_idx]
            if score < pred_score_threshold:
                continue

            class_index = int(detections[2][0][det_idx])
            if class_index not in tools_of_interest:
                continue

            unlabelled_yolo_output_data['source'].append('yolo')
            unlabelled_yolo_output_data['video_id'].append(video_id)
            unlabelled_yolo_output_data['frame_id'].append(frame_id)
            unlabelled_yolo_output_data['tool_id'].append(reverse_index_to_class[class_index])
            unlabelled_yolo_output_data['score'].append(score)

            # Remaining columns (x1, y1, x2, y2) are filled positionally from the box
            box = detections[0][0][det_idx]
            for coord_idx, col in enumerate(bb_col_names[1:]):
                unlabelled_yolo_output_data[col].append(box[coord_idx])

unlabelled_yolo_output_data_as_df = pd.DataFrame(unlabelled_yolo_output_data)

In [None]:
# Preview the first rows of the LSTM output for the unlabelled video
unlabelled_prediction_output_data_as_df.head()

In [None]:
# Stack the YOLO and LSTM rows for the labelled videos into one frame.
# NOTE(review): despite the name, only two sources are concatenated here; no ground-truth
# frame is included. The original indexes are kept, so index values repeat.
all_three_datasets = pd.concat([
    yolo_output_data_as_df,
    predicted_output_as_df
])

# Written next to the detection outputs as a comma-separated file without the index
all_three_datasets.to_csv(
    os.path.join(OUTPUTS_FROM_DETECTION_DIR, 'combined_call_boxes.csv'),
    sep=',', index=False)

# Same for the unlabelled video (LSTM rows first, then YOLO rows)
unlabelled_all_datasets = pd.concat([
    unlabelled_prediction_output_data_as_df,
    unlabelled_yolo_output_data_as_df
])

unlabelled_all_datasets.to_csv(
    os.path.join(OUTPUTS_FROM_DETECTION_DIR, 'unlabelled_all_datasets.csv'),
    sep=',', index=False)

In [None]:
# Preview the last rows of the LSTM output for the labelled videos
predicted_output_as_df.tail()

In [None]:
# Show the combined rows that belong to video S306T1
all_three_datasets[all_three_datasets['video_id']=='S306T1']

In [None]:
# NOTE(review): `true_labels_data_as_df` is never assigned in the visible notebook, so the bare
# expression raised NameError on Restart & Run All. The check is guarded until the
# ground-truth data frame is actually built.
if 'true_labels_data_as_df' in globals():
    print(true_labels_data_as_df.shape)
else:
    print("true_labels_data_as_df is not defined; the ground-truth data frame has not been built")

In [None]:
# Preview the first rows of the YOLO output for the labelled videos
yolo_output_data_as_df.head()

Below, we generate labelled plots where for each image, we have the yolo, lstm, and gt bounding boxes drawn, in order to be able to visually compare a certain set of frames

In [None]:
# Save one image per frame with every exported box drawn on top, coloured by source.
# Distinct '<video>_frame_<frame>' names, in sorted order
frames_only = sorted({re.sub('_tool.*', '', f) for f in frames_predicting_one})
colors_map = {
    'yolo': 'green',
    'lstm': 'red',
    'gt': 'blue'
}

out_spec_folder = 'yolo-lstm-gt'
out_spec_dir = os.path.join(OUTPUTS_FROM_DETECTION_DIR, out_spec_folder)
# savefig does not create missing directories
os.makedirs(out_spec_dir, exist_ok=True)

# Build the per-row frame key once instead of re-concatenating the columns for every frame
annotation_frame_keys = (
    all_three_datasets['video_id'] + '_frame_' + all_three_datasets['frame_id']
).values

for f in frames_only:
    matching_annotations = all_three_datasets[annotation_frame_keys == f]

    image_path = os.path.join(IMAGES_DIR, f + '.jpeg')
    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        # cv2.imread returns None instead of raising when the file is missing or unreadable
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads images as BGR; matplotlib expects RGB
    img_array = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    img_height, img_width = img_array.shape[:2]

    fig, ax = plt.subplots(1)
    ax.imshow(img_array)

    # Draw every box exported for this frame
    for annotation in matching_annotations.itertuples(index=False):
        # Columns are (x1, y1, x2, y2): x values are scaled by the image width and y values by
        # the image height. Previously the rectangle was anchored at (x1, x2) and every value
        # was scaled by the width.
        # NOTE(review): assumes normalised (0-1) corner coordinates, as the scaling implies — confirm.
        left = annotation.x1 * img_width
        top = annotation.y1 * img_height
        right = annotation.x2 * img_width
        bottom = annotation.y2 * img_height

        rect = Rectangle(
            (left, top),
            right - left,
            bottom - top,
            linewidth=1,edgecolor=colors_map[annotation.source],facecolor='none')

        ax.add_patch(rect)

    fig.savefig(os.path.join(out_spec_dir, f + '.jpeg'))
    # Close this figure explicitly so figures do not accumulate across the loop
    plt.close(fig)