# Fruit Identification

In [1]:
# import libraries
import os, shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# 1.1 Organize Data: Combine The Data

First, I will take all the images from the training and test sets and combine them into a single set.

In [11]:
# collect all data into one set, the "Complete_Set"
# (exist_ok=True keeps this cell re-runnable when the folder already exists)
os.makedirs("Complete_Set", exist_ok=True)

# iterate over train and test sets
for set_folder in ["Training", "Test"]:

    # iterate over all the different folders for fruit in both training and test sets
    for fruit_folder in os.listdir(set_folder):

        # build paths with os.path.join so the cell is not tied to the Windows separator
        source_dir = os.path.join(set_folder, fruit_folder)
        target_dir = os.path.join("Complete_Set", fruit_folder)

        # ignore stray files sitting next to the fruit folders
        if not os.path.isdir(source_dir):
            continue

        # create a new folder in the complete set every time a new fruit is seen
        os.makedirs(target_dir, exist_ok=True)

        # iterate over every image
        for img in os.listdir(source_dir):

            # save the names of the old and new file paths
            origin = os.path.join(source_dir, img)
            destination = os.path.join(target_dir, img)

            # copy the files into the complete set folder
            # NOTE: copyfile replaces an existing destination, so a Test image with the
            # same file name as a Training image of the same fruit overwrites it
            shutil.copyfile(origin, destination)

# 1.2 Organize Data: Train/Validation/Test Split
Now I will do a 70/15/15 split into train, validation, and test sets for each unique fruit.

In [49]:
# helper: report how many entries a fruit folder holds
def img_count(fruit_folder):
    """Return the number of directory entries found in ``fruit_folder``."""
    return len(os.listdir(fruit_folder))

# create a helper function for finding the correct indexes to split on
# based on the number of images in a fruit folder
def find_indexes(n_imgs, train_frac=0.7, val_cutoff=0.85):
    """Return the two slice boundaries for a train/validation/test split.

    Parameters
    ----------
    n_imgs : int
        Number of images being split.
    train_frac : float, optional
        Fraction of the images that goes to the training set (default 0.7).
    val_cutoff : float, optional
        Cumulative fraction at which the validation set ends (default 0.85,
        i.e. 15% validation and the remaining 15% test).

    Returns
    -------
    (train_index, val_index) : tuple of int
        Images ``[:train_index]`` are training, ``[train_index:val_index]``
        are validation, and everything after is test.

    Raises
    ------
    ValueError
        If the fractions are not ordered as 0 <= train_frac <= val_cutoff <= 1.
    """
    if not 0 <= train_frac <= val_cutoff <= 1:
        raise ValueError("expected 0 <= train_frac <= val_cutoff <= 1")

    # the training boundary rounds up while the validation boundary truncates,
    # so for very small folders val_index can be smaller than train_index
    train_index = int(np.ceil(train_frac * n_imgs))
    val_index = int(val_cutoff * n_imgs)

    return train_index, val_index

# create a function to split a fruit folder
def split_fruit(fruit_folder, train_directory, val_directory, test_directory, seed=None):
    """Copy one fruit's images from "Complete_Set" into train/val/test folders.

    The images are shuffled and then divided at the boundaries returned by
    ``find_indexes``; each image is copied (not moved) into
    ``<directory>/<fruit_folder>/`` of the subset it falls in.

    Parameters
    ----------
    fruit_folder : str
        Name of the fruit's sub-folder inside "Complete_Set".
    train_directory, val_directory, test_directory : str
        Root folders of the three output sets.
    seed : int, optional
        When given, the shuffle uses its own seeded generator so the split is
        reproducible. When omitted, the global NumPy random state is used.
    """
    source_dir = os.path.join("Complete_Set", fruit_folder)

    # list the folder once; sorting makes a seeded shuffle independent of the
    # order in which the operating system happens to return the entries
    imgs = sorted(os.listdir(source_dir))

    # find indexes to split on
    train_index, val_index = find_indexes(len(imgs))

    # shuffle images to ensure randomness
    if seed is None:
        np.random.shuffle(imgs)
    else:
        np.random.RandomState(seed).shuffle(imgs)

    # make sure each destination folder exists before copying into it
    for directory in (train_directory, val_directory, test_directory):
        os.makedirs(os.path.join(directory, fruit_folder), exist_ok=True)

    # iterate over images together with their position in the shuffled list
    for position, img in enumerate(imgs):

        # pick the subset from the position instead of searching list slices,
        # which rebuilt and scanned a slice for every single image
        if position < train_index:
            target_root = train_directory
        elif position < val_index:
            target_root = val_directory
        else:
            target_root = test_directory

        origin = os.path.join(source_dir, img)
        destination = os.path.join(target_root, fruit_folder, img)

        # copy image to the correct destination
        shutil.copyfile(origin, destination)

In [None]:
# the three new sets: train, validation, and test
subsets = ["new_train", "new_validation", "new_test"]

# iterate over all the unique fruits in the complete data set
for fruit_folder in os.listdir("Complete_Set"):

    # make sure every set has a directory for this fruit; exist_ok=True tolerates
    # re-runs, while real failures (e.g. permissions) are no longer swallowed
    for subset in subsets:
        os.makedirs(os.path.join(subset, fruit_folder), exist_ok=True)

    # copy the images over and do the train/val/test split
    split_fruit(fruit_folder, "new_train", "new_validation", "new_test")