<a href="https://colab.research.google.com/github/danmenloz/LeafWilting/blob/main/Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create Dataset structure
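Run this notebook to build the dataset directory: it splits the annotated training images into test, train and validation sets and copies each image into a folder named after its class.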

In [1]:
# Libraries
import os
import shutil
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.utils import class_weight
import random

In [2]:
# Useful functions
def loadImages(path):
    """Return the sorted file names of every '.jpg' file found directly in `path`.

    Only the names are collected; the image data itself is not read.

    Args:
        path: Directory to scan (sub-directories are not visited).

    Returns:
        list[str]: Base names (no directory part) of the entries whose name
        ends with '.jpg' (case-sensitive), in ascending order.
    """
    jpg_names = []
    for entry in os.listdir(path):
        if entry.endswith('.jpg'):
            jpg_names.append(entry)
    # every entry shares the same directory prefix, so ordering the bare
    # names gives the same order as ordering the full paths
    jpg_names.sort()
    return jpg_names


def loadAnnotations(path):
  """Read the annotations CSV and return its first column and its labels.

  Args:
    path: Path to a CSV file that pandas can read and that contains an
      'annotation' column.

  Returns:
    tuple: (f_names, annotations). f_names is a list holding the value of
    column 0 for every row (presumably the image file name - confirm against
    the CSV); annotations is a flat numpy array with the 'annotation' column.
  """
  frame = pd.read_csv(path)
  first_column = list(frame.values[:, 0]) # read column 0
  flat_labels = frame[['annotation']].to_numpy().ravel()
  return first_column, flat_labels
 

def buildDataset(f_names, annotations, img_names):
  """Pair every image found on disk with its label from the annotations file.

  Args:
    f_names: File names listed in the annotations file.
    annotations: Labels aligned with `f_names` (annotations[k] belongs to
      f_names[k]).
    img_names: File names of the images actually present in the data folder.

  Returns:
    list[dict]: One {'file_name': ..., 'annotation': ...} record per entry of
    `img_names` that has an annotation, in the order of `img_names`. Images
    without an annotation are reported on stdout and skipped.
  """
  # Map each annotated file name to its row once instead of running a linear
  # list.index() search per image. setdefault keeps the first row when a name
  # is listed more than once, which is the row list.index() would return.
  row_of = {}
  for row, name in enumerate(f_names):
    row_of.setdefault(name, row)

  img_set = [] # empty list
  for img_name in img_names:
    row = row_of.get(img_name)
    if row is None:
      print("Image " + img_name + " not found in annotations file!")
      continue
    # Use the row that was looked up, not the image's own position, so the
    # label stays correct even when the folder and the CSV differ in order
    # or in length.
    img_set.append({'file_name': f_names[row], 'annotation': annotations[row]})

  return img_set


# # Histogram equalization
# # https://stackoverflow.com/questions/31998428/opencv-python-equalizehist-colored-image
# def histEqualization(dataset):
#   equ_dataset = dataset # copy input dataset
#   for n in range(len(dataset)):
#     # convert to YUV
#     img_yuv = cv2.cvtColor(dataset[n]['image'], cv2.COLOR_BGR2YUV)
#     # equalize the histogram of the Y channel
#     img_yuv[:,:,0] = cv2.equalizeHist(img_yuv[:,:,0])
#     # convert the YUV image back to RGB format
#     equ_dataset[n]['image']  = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2BGR)
#   return equ_dataset


def countClasses(dataset, _print=True, n_classes=5):
  """Count how many records of `dataset` carry each class label.

  Args:
    dataset: Iterable of dicts, each with an integer 'annotation' entry.
    _print: When true, print one line per class followed by the total.
    n_classes: Number of classes; labels 0 .. n_classes-1 are counted.
      Defaults to 5, the value that used to be hard-coded here.

  Returns:
    list[int]: count[k] is the number of records labelled k.
  """
  labels = np.asarray( [d['annotation'] for d in dataset] )
  # plain ints keep the returned list readable when the cell displays it
  count = [ int(np.sum(labels == k)) for k in range(n_classes) ]
  if _print:
    for k, n in enumerate(count):
      print("Class %d: %d" %(k, n))
    print(" Total :", len(labels))
    print("") # blank line
  return count


def countLabels(labels):
  """Print how often each class (0 to 4) occurs in `labels`, then the total.

  Args:
    labels: Class labels; they are compared with `==` against each class id,
      so an element-wise comparable container such as a numpy array is
      expected.
  """
  for cls in range(5):
    print("Class %d:" % cls, np.sum(labels == cls))
  print(" Total :", len(labels))
  print("") # blank line

In [3]:
# Mount Google Drive so the project folder is reachable from this Colab runtime
from google.colab import drive

drive.mount("/content/drive")
# drive.flush_and_unmount()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# The images and their annotation file live in the same folder
data_path = "drive/My Drive/ECE_542-Project_1/TrainingData"
annotations_path = os.path.join(data_path, 'Annotations.csv')

# File names of the images on disk
img_names = loadImages(data_path)

# File names and labels listed in the annotation csv file
f_names, labels = loadAnnotations(annotations_path)

# One {'file_name', 'annotation'} record per image
dataset = buildDataset(f_names, labels, img_names)

print("Initial count")
countClasses(dataset)


Initial count
Class 0: 488
Class 1: 329
Class 2: 130
Class 3: 131
Class 4: 197
 Total : 1275



[488, 329, 130, 131, 197]

In [5]:
# Build Test dataset

# Shuffle dataset with a fixed seed so the same images land in the test set
# on every run. A private Random instance is used so the global random state
# is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)

print("Before Split")
countClasses(dataset)

# Extract the same number of samples per class for the test dataset
n_samples = 20
test_dataset = [] # start with empty set
for a in range(5): # 5 classes
  class_items = [d for d in dataset if d['annotation'] == a]
  # first n_samples records of this class, in shuffled order
  test_dataset.extend(class_items[:n_samples])

# Update dataset: drop every image that went to the test set. Membership is
# checked against a set of file names, so this pass is linear in the dataset
# size instead of comparing each record with the whole test list.
test_files = {d['file_name'] for d in test_dataset}
dataset = [d for d in dataset if d['file_name'] not in test_files]

print("After Split")
countClasses(dataset)

Before Split
Class 0: 488
Class 1: 329
Class 2: 130
Class 3: 131
Class 4: 197
 Total : 1275

After Split
Class 0: 468
Class 1: 309
Class 2: 110
Class 3: 111
Class 4: 177
 Total : 1175



[468, 309, 110, 111, 177]

In [6]:
# Show one record of the dataset as a quick sanity check
sampleID = 19
# plt.imshow(dataset[sampleID]['image'])
sample_record = dataset[sampleID]
print(sample_record)

{'file_name': '007990.jpg', 'annotation': 1}


In [7]:
# Build Training and Validation datasets

# Fraction of the samples kept for validation (30%)
valid_set_pc = 0.3

# # Unbalanced dataset
# count = countClasses(dataset, _print=False)
# validation_dataset = []
# training_dataset = []
# for i in range(len(count)):
#   split = int(count[i]*(1-valid_set_pc))
#   class_dataset = [ d for d in dataset if d['annotation']==i ]
#   validation_dataset.extend( class_dataset[split:] )
#   training_dataset.extend( class_dataset[:split] )

# Balanced dataset: every class contributes the same number of validation
# samples, namely the validation share of the whole set divided by 5 classes
count = countClasses(dataset, _print=False)
n_samples = int(np.sum(count)*valid_set_pc/5)
validation_dataset = [] # start with empty set
for a in range(5): # 5 classes
  same_class = [d for d in dataset if d['annotation'] == a]
  # first n_samples records of this class, in dataset order
  validation_dataset.extend(same_class[:n_samples])
# Everything that was not picked for validation is used for training
training_dataset = [d for d in dataset if d not in validation_dataset]

print("Validation dataset")
countClasses(validation_dataset)
print("Training dataset")
countClasses(training_dataset)

Validation dataset
Class 0: 70
Class 1: 70
Class 2: 70
Class 3: 70
Class 4: 70
 Total : 350

Training dataset
Class 0: 398
Class 1: 239
Class 2: 40
Class 3: 41
Class 4: 107
 Total : 825



[398, 239, 40, 41, 107]

In [8]:
# Compute class weights. Useful for unbalanced datasets
trainY = [ d['annotation'] for d in training_dataset ]
classes = np.unique(trainY)
# `classes` and `y` are passed by keyword: they are keyword-only in current
# scikit-learn releases, where the positional form raises a TypeError.
c_weights = class_weight.compute_class_weight('balanced',
                                               classes=classes,
                                               y=np.asarray(trainY))
# Create dictionary keyed by the actual class label rather than by position,
# so the mapping stays right even if a class is absent from the training set.
# Plain int/float values keep the printed dictionary readable.
s_weights = { int(c) : float(w) for c, w in zip(classes, c_weights) }
print(s_weights)

# Normalize weights so that they sum to 1
norm_weights = { int(c) : float(w)
                 for c, w in zip(classes, c_weights/c_weights.sum()) }
print(norm_weights)


{0: 0.41457286432160806, 1: 0.6903765690376569, 2: 4.125, 3: 4.024390243902439, 4: 1.5420560747663552}
{0: 0.03839919116004359, 1: 0.06394509657613953, 2: 0.3820719520424337, 3: 0.3727531239438378, 4: 0.14283063627754533}


In [9]:
# Save dataset to the respective directories

base_path = "drive/My Drive/ECE_542-Project_1/"

# create directory structure: data/<split>/<class> for 3 splits and 5 classes
datasets_paths = [ 'data/test', 'data/train', 'data/validation' ]

# Directories listed parent-first, in the order they are created and reported
wanted_dirs = ['data']
for split_dir in datasets_paths:
  wanted_dirs.append(split_dir)
  wanted_dirs.extend(split_dir + '/' + str(c) for c in range(5)) # 5 classes

try:
  for rel_dir in wanted_dirs:
    path = os.path.join(base_path, rel_dir)
    # only directories that had to be created are printed
    if not os.path.isdir(path):
      os.makedirs(path)
      print(path)
except OSError:
  print ("Creation of the directory %s failed" % path)
else:
  print ("Successfully created the directories")

# copy images to the respective folders

# test_data_path = os.path.join(base_path, 'TestData')
training_data_path = os.path.join(base_path, 'TrainingData')
datasets_paths = [ os.path.join(base_path, split_dir)
                   for split_dir in ('data/test/', 'data/train/', 'data/validation/') ]
datasets = [test_dataset, training_dataset, validation_dataset]

try:
  # datasets and datasets_paths are aligned: test, train, validation
  for split_records, split_root in zip(datasets, datasets_paths):
    for record in split_records:
      file_path = os.path.join(training_data_path, record['file_name'])
      # destination is the class folder named after the record's label
      dest_path = os.path.join(split_root, str(record['annotation']))
      shutil.copy(file_path, dest_path)
      print("%s -> %s" %(file_path, dest_path))
except Exception as e:
  # the first failing copy stops the whole loop
  print ("Couldn't copy files")
  print(e)
else:
  print ("Successfully copied files")

drive/My Drive/ECE_542-Project_1/data
drive/My Drive/ECE_542-Project_1/data/test
drive/My Drive/ECE_542-Project_1/data/test/0
drive/My Drive/ECE_542-Project_1/data/test/1
drive/My Drive/ECE_542-Project_1/data/test/2
drive/My Drive/ECE_542-Project_1/data/test/3
drive/My Drive/ECE_542-Project_1/data/test/4
drive/My Drive/ECE_542-Project_1/data/train
drive/My Drive/ECE_542-Project_1/data/train/0
drive/My Drive/ECE_542-Project_1/data/train/1
drive/My Drive/ECE_542-Project_1/data/train/2
drive/My Drive/ECE_542-Project_1/data/train/3
drive/My Drive/ECE_542-Project_1/data/train/4
drive/My Drive/ECE_542-Project_1/data/validation
drive/My Drive/ECE_542-Project_1/data/validation/0
drive/My Drive/ECE_542-Project_1/data/validation/1
drive/My Drive/ECE_542-Project_1/data/validation/2
drive/My Drive/ECE_542-Project_1/data/validation/3
drive/My Drive/ECE_542-Project_1/data/validation/4
Successfully created the directories
drive/My Drive/ECE_542-Project_1/TrainingData/004422.jpg -> drive/My Drive/ECE_