# Label Processing

## Configuration

The total dataset will be trimmed to be more consistant and understandable for the model. `max_diff` will note the maximum difference between the ed and es frames. The `buffer` variable specifies how much frames are required before the ed and after the es. Additionally, `linear_arrangement` specifies if the es has to come after ed.

In [2]:
min_len = 50
max_diff = 25
buffer = 25
linear_arrangement = True

## Package Imports

In [3]:
import os.path
import numpy as np
import pandas as pd
from IPython.display import clear_output

from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


## Dataset Imports

In [4]:
samples_dir = "/content/gdrive/My Drive/Colab Notebooks/project/dataset/raw"

raw_label_path = f"{samples_dir}/volume_tracings.csv"
sample_data_path = f"{samples_dir}/file_list.csv"

labels = pd.read_csv(raw_label_path)
file_attributes = pd.read_csv(sample_data_path)

## Path Helper Functions

In [5]:
# given a file path, return the path to directory
def _dir(path):
  inv_path = path[::-1]
  inv_path = inv_path[inv_path.index('/')+1:]
  return inv_path[::-1]
  
# given a file path, return the file name
def _file(path):
  inv_path = path[::-1]
  inv_path = inv_path[:inv_path.index('/')]
  return inv_path[::-1]

# given a full file name, return without file type
def _typelessFile(path):
  inv_path = path[::-1]
  inv_path = inv_path[:inv_path.index('/')]
  nor_path = inv_path[::-1]
  nor_path = nor_path[:nor_path.index('.')]
  return nor_path

def _fileType(path):
  inv_path = path[::-1]
  nor_path = inv_path[:inv_path.index('.')]
  return "." + nor_path[::-1]

# search for the labels for the sample
def find_label(labels, file_name):
  search_result = labels.loc[labels['FileName'] == file_name]
  return search_result

## Generating Information Per Sample

In [6]:
data = []
progress = 0
rows_amount = file_attributes.shape[0]

for index, row in file_attributes.iterrows():
  # progress counter
  current_progress = int(((index+1)/rows_amount)*100)
  if current_progress != progress:
    progress = current_progress
    clear_output()
    print(f"progress: {progress}% (approx. sample {index+1}/{rows_amount})")

  # finding file name
  file_name_avi = row["FileName"]
  file_name = _typelessFile("/" + file_name_avi)
  
  # finding file type
  file_type = _fileType(file_name_avi)

  # finding number of frames
  frames = row["NumberOfFrames"]

  # finding ed and es heart phases
  seq_label = find_label(labels, file_name)
  if seq_label.size == 0:
    continue
  all_results = np.array(seq_label)
  all = []
  for row in all_results:
    all.append(row[5])
  unique = list(set(all))
  ed, es = unique[0], unique[1]
  diff = es - ed

  # performing checks
  if linear_arrangement and diff <= 0:
    continue
  if diff > max_diff:
    continue
  if ed < buffer:
    continue
  if (es + buffer) > frames:
    continue
  if (ed + diff + (frames - es)) < min_len:
    # i.e. (pre-ed + es-ed + post-ed) < min_len
    continue
  
  entry = [samples_dir + "/", file_name, file_type, frames, ed, es, diff]
  data.append(entry)

progress: 100% (approx. sample 10030/10030)


## Shuffling Data Rows

In [7]:
data = pd.DataFrame(data).sample(frac=1).reset_index(drop=True).values.tolist()

## Adding Columns

In [8]:
df = pd.DataFrame(data, columns=['directory', 'file_name', 'extension', 'frames', 'ed', 'es', 'diff'])

## Sanity Check

In [9]:
df

Unnamed: 0,directory,file_name,extension,frames,ed,es,diff
0,/content/gdrive/My Drive/Colab Notebooks/proje...,0X71BB6DF9BBBA718F,.avi,197,52,69,17
1,/content/gdrive/My Drive/Colab Notebooks/proje...,0X45C47B88561016CD,.avi,172,112,133,21
2,/content/gdrive/My Drive/Colab Notebooks/proje...,0X7204B716FE2009EA,.avi,122,32,44,12
3,/content/gdrive/My Drive/Colab Notebooks/proje...,0X3821E9F21EBFFF33,.avi,209,56,68,12
4,/content/gdrive/My Drive/Colab Notebooks/proje...,0X7498BD710B94CF12,.avi,175,84,103,19
...,...,...,...,...,...,...,...
4001,/content/gdrive/My Drive/Colab Notebooks/proje...,0X43664BF0CDA9C803,.avi,318,194,213,19
4002,/content/gdrive/My Drive/Colab Notebooks/proje...,0X633F7FD1EA0A3F2C,.avi,201,52,68,16
4003,/content/gdrive/My Drive/Colab Notebooks/proje...,0X11BDF610427B903F,.avi,161,42,58,16
4004,/content/gdrive/My Drive/Colab Notebooks/proje...,0X19E82707BBBA6452,.avi,277,112,132,20


## Exporting File

In [10]:
df.to_csv(samples_dir+"/raw_labels.csv")