# Loading Data

In [None]:
from torch.utils.data import Dataset, DataLoader

### Removing Label Studio IDs from Label Files

In [None]:
# Strip the leading "<id>-" prefix from every label file name
import os

LABEL_PATH = '../data/unzipped/labels'

# Safety switch: the rename is destructive, so it only runs when this is
# explicitly set to True. Running it again on names that still contain a
# hyphen would strip a further segment.
RENAME = False


def strip_label_studio_id(file_name):
    """Return *file_name* without its leading ``<id>-`` prefix.

    Only the first hyphen is treated as the separator, so hyphens that belong
    to the rest of the name (and the extension) are preserved. A name without
    a hyphen, or with nothing after it, is returned unchanged.
    """
    _, separator, remainder = file_name.partition('-')
    if separator and remainder:
        return remainder
    return file_name


if RENAME:
    for file_name in os.listdir(LABEL_PATH):
        new_file_name = strip_label_studio_id(file_name)

        # Nothing to strip: leave the file alone instead of failing.
        if new_file_name == file_name:
            continue

        os.rename(
            os.path.join(LABEL_PATH, file_name),
            os.path.join(LABEL_PATH, new_file_name),
        )

### Mapping Classes

In [None]:
import pandas as pd
import json

# The mapping file must be JSON shaped like:
# {"categories": [{"id": 1, "name": "class_name"}, ...], ...}
JSON_MAPPING_PATH = '../data/unzipped/notes.json'

# Index 0 holds a None placeholder, so the first listed category sits at index 1.
class_map = [None]

with open(JSON_MAPPING_PATH, 'r') as mapping_file:
    categories = json.load(mapping_file)['categories']
    # Names are stored in the order they appear in the file.
    class_map.extend(category['name'] for category in categories)


class_map

### Making Dataset

In [None]:
import os
import PIL.Image
import numpy as np
import torchvision.transforms as ttran

class PressureUlcers(Dataset):
    """Dataset pairing every image with the label rows read from its .txt file.

    ``images_path`` must contain one sub-folder per group of images. Images in
    the sub-folder named ``'Invalid'`` get an empty label array; every other
    image must have a ``<image name without extension>.txt`` file in
    ``labels_path``.
    """

    def __init__(self, images_path, labels_path, transform):
        """
        - *images_path*: folder whose sub-folders hold the images to load.
        - *labels_path*: folder with .txt files with corresponding labels. Each file must be named exactly like its counterpart.
        - *transform*: callable applied to the PIL image in ``__getitem__``.
        """
        super().__init__()
        # We need to load everything in a specific format, appropriate for YOLO.
        # Only the image paths and the label arrays are stored here; the
        # images themselves are opened on demand in __getitem__.

        self.transform = transform
        self.X = []
        self.y = []

        # For each folder
        for folder_name in os.listdir(images_path):

            # Get image file names
            folder_files = os.listdir(os.path.join(images_path, folder_name))

            # For each image, get its corresponding labels (it can have multiple)
            for file_name in folder_files:
                self.X.append(os.path.join(images_path, folder_name, file_name))

                # If it's a classified folder, read its label file
                if folder_name != 'Invalid':
                    self.y.append(self._load_labels(labels_path, file_name))
                # Otherwise, insert an empty label
                else:
                    self.y.append(np.array([]))

            print(f'folder: {folder_name}\nfiles: {folder_files}\n\n')

    @staticmethod
    def _load_labels(labels_path, image_file_name):
        """Return the label rows for *image_file_name* as a numpy array.

        The label file is ``<stem>.txt`` inside *labels_path*, where the stem
        is the image name with only its final extension removed, so names that
        contain extra dots still resolve to the right file. An empty label
        file yields an empty array, the same value used for unlabelled images.
        A missing label file still raises ``FileNotFoundError``.
        """
        stem, _ = os.path.splitext(image_file_name)
        label_file_path = os.path.join(labels_path, f'{stem}.txt')
        try:
            return pd.read_csv(label_file_path, sep=' ', header=None).to_numpy()
        except pd.errors.EmptyDataError:
            # A label file with no rows means the image has no boxes.
            return np.array([])

    def __getitem__(self, index):
        """
        Returns, for **index**:
            PIL_img: unaltered PIL image.
            transformed_img: result of applying the transform to the PIL image.
            labels: numpy array with one row per label, each with format [class, x, y, width, height]
                (empty for images without labels).
        """
        pil_img = PIL.Image.open(self.X[index])
        return pil_img, self.transform(pil_img), self.y[index]

    def __len__(self):
        """Return the number of images found under ``images_path``."""
        return len(self.X)

# Transform handed to the dataset; it is applied to each PIL image on access.
transform = ttran.PILToTensor()

# Build the dataset from the image folders and the unzipped label files.
ds = PressureUlcers(
    images_path='../data/images',
    labels_path='../data/unzipped/labels',
    transform=transform,
)

## Testing Specific Index

In [None]:
# Fetch sample 0: the unaltered PIL image, its transformed version, and its labels.
pil_img, tensor_img, labels = ds[0]

## Displaying Index Data

In [None]:
from display import display_img_boxes

# Show the fetched sample through the project's display helper, passing its
# labels and the class-name lookup list.
# NOTE(review): presumably draws each labelled box over the image — confirm in display.py.
display_img_boxes(pil_img, labels, class_map)

## Data Analysis

In [None]:
# Count how many labels each class has across the whole dataset.
frequencies = {}

# Read the label arrays stored on the dataset directly: ds[i] returns these
# same arrays, but it also opens and transforms every image, which counting
# does not need.
for labels in ds.y:
    for label in labels:
        # The first value of a label row is the class index into class_map.
        class_name = class_map[int(label[0])]
        frequencies[class_name] = frequencies.get(class_name, 0) + 1

frequencies

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart: one bar per class name, sized by its label count.
plt.barh(frequencies.keys(), frequencies.values())