# Inspect the training data

In [None]:
from data import DiabeticRetinopathyDataset
import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd
from typing import Dict, Tuple


In [None]:
# Arguments handed to DiabeticRetinopathyDataset when loading the full
# training set.
TRAIN_LABELS_CSV = "trainLabels.csv"
TRAIN_FOLDER = "train"
DATA_FOLDER = "data"

In [None]:
# Build the dataset object for the complete training set from the label CSV
# and folder names configured above.
DR_dataset = DiabeticRetinopathyDataset(TRAIN_LABELS_CSV, DATA_FOLDER, TRAIN_FOLDER)

In [None]:
# Count the samples per severity level, then show the distribution both as an
# annotated bar chart and as a plain-text table.
# `label_count` stays a (labels, counts) tuple because later cells read it.
label_count = np.unique(DR_dataset.labels, return_counts=True)
labels, counts = label_count

plt.bar(labels, counts)
# The DR severity scale runs from 0 (no DR) to 4 (proliferative DR).
plt.xlabel("Diabetic Retinopathy (DR) severity level. 0 = no DR; 4 = proliferative DR")
plt.ylabel("no. of samples")
plt.title("Class distribution")

# Add some headroom above the tallest bar so the count annotations fit
# inside the axes.
y_low, y_high = plt.ylim()
plt.ylim(y_low, y_high + y_high / 25)
# Anchor each annotation at the bar's own x position (its label value) rather
# than at the loop index, so the text stays aligned even if the labels are
# not exactly 0..n-1.
for label, count in zip(labels, counts):
    plt.text(label, count + y_high / 35, count, ha="center")

print(" label | count \n"
      "-------|-------")
for label, count in zip(labels, counts):
    print(f"   {label}   | {count}  ")

# plt.savefig("media/class_distribution_resampled.pdf")
plt.show()

## Create a reduced, balanced dataset to allow fast, iterative training

In [None]:
"""
    Create a reduced dataset to train on. Sample 700 samples from every class.
    Since the data is already shuffled, we can just use the first 700 samples.
"""
indices = np.arange(len(DR_dataset.labels))
unique_labels = label_count[0]
new_data_set = np.array([], dtype=int)
for label in unique_labels:
    mask = DR_dataset.labels == label
    indices_for_class_i = indices[mask]
    first_700 = indices_for_class_i[:700]
    new_data_set = np.append(new_data_set, first_700)
    
new_data_set.sort()

indices, unique_labels, new_data_set, len(new_data_set)

In [None]:
import csv

# One-off export of the reduced label file: writes an "image","level" header
# followed by one row per index selected in `new_data_set`.
# NOTE(review): left commented out; presumably it was run once to produce
# reducedTrainLabels.csv, which the following cells load — TODO confirm.
# with open('reducedTrainLabels.csv', 'w', newline='') as file:
#     writer = csv.writer(file)
    
#     writer.writerow(["image", "level"])
#     for item, label in zip(DR_dataset.items[new_data_set], DR_dataset.labels[new_data_set]): 
#         writer.writerow([item, label])

In [None]:
# Arguments handed to DiabeticRetinopathyDataset when loading the reduced
# training set; only the label CSV differs from the full-set configuration.
TRAIN_LABELS_CSV = "reducedTrainLabels.csv"
TRAIN_FOLDER = "train"
DATA_FOLDER = "data"

In [None]:
# Build the dataset object for the reduced training set from the label CSV
# and folder names configured above.
DR_dataset_reduced = DiabeticRetinopathyDataset(TRAIN_LABELS_CSV, DATA_FOLDER, TRAIN_FOLDER)

In [None]:
# Count the samples per severity level in the reduced dataset, then show the
# distribution both as an annotated bar chart and as a plain-text table.
# `label_count` stays a (labels, counts) tuple, as in the original cell.
label_count = np.unique(DR_dataset_reduced.labels, return_counts=True)
labels, counts = label_count

plt.bar(labels, counts)
# The DR severity scale runs from 0 (no DR) to 4 (proliferative DR).
plt.xlabel("Diabetic Retinopathy (DR) severity level. 0 = no DR; 4 = proliferative DR")
plt.ylabel("no. of samples")
plt.title("Class distribution")

# Add some headroom above the tallest bar so the count annotations fit
# inside the axes.
y_low, y_high = plt.ylim()
plt.ylim(y_low, y_high + y_high / 25)
# Anchor each annotation at the bar's own x position (its label value) rather
# than at the loop index, so the text stays aligned even if the labels are
# not exactly 0..n-1.
for label, count in zip(labels, counts):
    plt.text(label, count + y_high / 35, count, ha="center")

print(" label | count \n"
      "-------|-------")
for label, count in zip(labels, counts):
    print(f"   {label}   | {count}  ")
