# Data Cleaning

#### This notebook is for visualizing the images and manually labeling them as valid or not. An image can be invalid if it has artifacts or if its labeled view is not correct.

## Imports

In [None]:
import os
import random

import pandas as pd
from skimage import exposure
import ipywidgets as widgets
from IPython.display import display

from lib.plot_utils import show_images
from lib.image_processing import load_numpy_data

## Config

In [None]:
# Path to the folder that contains the extracted subject folders
subjects_path = "../../../datasets/BIMCV-COVID19-cIter_1_2/covid19_posi/"

# Path to the TSV with all the image file paths by subject and session
partitions_tsv_path = os.path.join(subjects_path, "derivatives/partitions.tsv")

# Path to the TSV with the images selected to create the ECVL dataset (these images are preprocessed)
preproc_ecvl_dataset = os.path.join(subjects_path, "ecvl_bimcv_covid19.tsv")

# Path of the output file to generate with the manual filtering
output_file = os.path.join(subjects_path, "data_cleaning.tsv")

# Path to an output file of a previous run to use as a checkpoint
#   Note: If it is not found, a new file will be created from scratch
#   Note: This is the same path as "output_file", so saving overwrites the checkpoint
ckpt_file = os.path.join(subjects_path, "data_cleaning.tsv")

## Prepare the output file

In [None]:
# "out_df" holds one row per labeled image with the columns subject, session and status.
# The status can be: "OK", "WRONG VIEW", "ARTIFACTS", "INVALID"
if not os.path.isfile(ckpt_file):
    # There is no previous run to resume, so start from an empty table
    print("Checkpoint file not found. Going to create a new file.")
    out_df = pd.DataFrame(columns=["subject", "session", "status"])
else:
    # Resume from the labels saved by a previous run
    print(f"Going to load the checkpoint file from {ckpt_file}")
    out_df = pd.read_csv(ckpt_file, sep="\t")

## Select the set of images to clean

In [None]:
use_all = False  # If True: use all the images of "partitions_tsv_path", filtered by the view AP/PA
                 # If False: use the images of "preproc_ecvl_dataset", which are already filtered

if use_all:
    # Load the dataframe with all the images by session and subject
    cols = ["subject", "session", "filepath"]  # The original columns must be fixed
    df = pd.read_csv(partitions_tsv_path, sep="\t", header=0, names=cols)
else:
    # Load the dataframe of the preprocessed dataset
    df = pd.read_csv(preproc_ecvl_dataset, sep="\t")

# Don't take the samples whose session is already labeled in "out_df"
df = df.loc[~df["session"].isin(out_df["session"])]

# Read the three columns in lockstep. This gives the same (subject, session, filepath)
# tuples as DataFrame.iterrows() without building a Series object for every row.
images_data = list(zip(df["subject"], df["session"], df["filepath"]))

if use_all:
    # Keep only the images whose file path marks them as an AP or PA view
    images_data = [
        (sub, sess, path)
        for sub, sess, path in images_data
        if "vp-ap" in path or "vp-pa" in path
    ]

# Get the full path. The file paths of the TSV are joined to the main data folder
images_data = [(sub, sess, os.path.join(subjects_path, path)) for sub, sess, path in images_data]


In [None]:
# Stop here when every image is already labeled (or nothing was selected)
if not images_data:
    raise Exception("There are no images to show!")

SUB, SESS, PATH = 0, 1, 2  # Auxiliary indexes of the fields inside each "images_data" tuple

current_image = 0  # Index of the image that is currently shown

# Output widgets used to control the layout
output = widgets.Output()  # Area to show the images
log_out = widgets.Output()  # Area to show the dataframe status

def show_image(index):
    """Clear the image area and show the image at position ``index`` of ``images_data``."""
    output.clear_output()
    with output:
        # Everything is done inside the widget context, like the drawing itself
        image_path = images_data[index][PATH]
        image = load_numpy_data(image_path)
        show_images([image])
        
# Start with the first image of the list
show_image(current_image)

# Print the last labeled rows to show the dataframe status
log_out.clear_output()
with log_out:
    print(out_df.tail(20))

# Create the buttons to label the images (all of them share the same size)
buttons_layout = widgets.Layout(width='200px', height='100px')
b_next = widgets.Button(
    description="Next",
    button_style="info",
    icon='arrow-right',
    layout=buttons_layout,
)
b_inval = widgets.Button(
    description="Invalid",
    button_style="danger",
    icon='times',
    layout=buttons_layout,
)
b_arti = widgets.Button(
    description="Artifacts",
    button_style="warning",
    icon='times',
    layout=buttons_layout,
)
b_view = widgets.Button(
    description="Wrong View",
    button_style="danger",
    icon='times',
    layout=buttons_layout,
)
b_ok = widgets.Button(
    description="OK",
    button_style="success",
    icon='check',
    layout=buttons_layout,
)


# Prepare the function callbacks for the buttons to label the current image

def set_label(label=None):
    """Store the label of the current image (if any) and pass to the next image.

    Args:
        label: Status to store for the current image ("OK", "WRONG VIEW",
            "ARTIFACTS" or "INVALID"). If it is None (or empty) the image is
            skipped and no row is added to the dataframe.
    """
    global current_image, out_df

    # Ignore a click that arrives when all the images were already visited
    # (the buttons are disabled at that point, but a click could still be pending)
    if current_image >= len(images_data):
        return

    if label:
        sub = images_data[current_image][SUB]
        sess = images_data[current_image][SESS]

        # Add the label to the dataframe. DataFrame.append() was removed in
        # pandas 2.0, so the new row is concatenated instead.
        new_row = pd.DataFrame([{"subject": sub, "session": sess, "status": label}])
        out_df = pd.concat([out_df, new_row], ignore_index=True)

    # Log dataframe status
    log_out.clear_output()
    with log_out:
        print(out_df.tail(20))

    # Pass to the next image
    current_image += 1
    if current_image < len(images_data):
        show_image(current_image)
    else:
        # No more images: disable the buttons and tell the user to save the results
        b_next.disabled = True
        b_inval.disabled = True
        b_arti.disabled = True
        b_view.disabled = True
        b_ok.disabled = True
        with log_out:
            print("\nYOU REACHED THE END OF THE SAMPLES LIST!")
            print("Save the TSV file with the next cell of the notebook")
    
def _make_label_callback(label):
    """Build a click callback that calls ``set_label`` with the given ``label``."""
    def _on_click(arg):
        # "arg" is the value received from the button click; it is not used
        set_label(label)
    return _on_click


is_next = _make_label_callback(None)  # Pass to the next image without labeling
is_invalid = _make_label_callback("INVALID")
is_artifacts = _make_label_callback("ARTIFACTS")
is_wrong_view = _make_label_callback("WRONG VIEW")
is_ok = _make_label_callback("OK")
        
# Connect every button with its callback
for button, callback in (
    (b_next, is_next),
    (b_inval, is_invalid),
    (b_arti, is_artifacts),
    (b_view, is_wrong_view),
    (b_ok, is_ok),
):
    button.on_click(callback)

# Put the buttons in a column, placed between the image and the dataframe log
buttons = widgets.VBox([b_next, b_inval, b_arti, b_view, b_ok])
display(widgets.HBox([output, buttons, log_out]))

## Save the TSV with the labeled samples

In [None]:
# Write the labeled samples as a tab-separated file, without the dataframe index column
out_df.to_csv(output_file, sep='\t', index=False)