# **Data Collection**

## Objectives

* Fetch data from Kaggle and prepare it for further processes.

## Inputs

*   Kaggle JSON file - the authentication token. 

## Outputs

* Generate Dataset: inputs/leaves_dataset/leaf_images (split into train, validation and test folders)

## Additional Comments

* We will also create a separate directory src/montage_images with a few of the validation set images downsized to 200x200


---

# Import packages

In [None]:
%pip install -r requirements.txt

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Remember the current working directory; the chdir cell below derives the parent folder from it.
current_dir = os.getcwd()
# Bare expression: the notebook echoes the value as the cell output.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Move one level up from the directory captured in `current_dir`.
# The target is computed from `current_dir`, not from the live working directory.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory after the chdir and echo it to confirm the change.
current_dir = os.getcwd()
current_dir

# Installing Kaggle

In [None]:
# install kaggle package
%pip install kaggle

Run the cell below **to change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON**.

In [None]:
# Use the current working directory as the Kaggle configuration directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Mode 600: read/write for the file owner only, so the token is not readable by others.
! chmod 600 kaggle.json

- We will now download the data from Kaggle. Our data path is 'warcoder/potato-leaf-disease-dataset' (the last part of the [URL](https://www.kaggle.com/datasets/warcoder/potato-leaf-disease-dataset))

- **Make sure you have your kaggle.json file in the current working directory**

In [None]:
# Dataset identifier: the last part of the dataset URL on Kaggle.
KaggleDatasetPath = "warcoder/potato-leaf-disease-dataset"
# Local folder that receives the downloaded archive; later cells reuse this variable.
DestinationFolder = "inputs/leaves_dataset"   
# Shell call to the Kaggle CLI; the {…} placeholders are filled from the variables above.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

We will now unzip the downloaded file, and delete the zip file.

In [8]:
import zipfile
# Path of the archive downloaded into DestinationFolder.
zip_path = DestinationFolder + '/potato-leaf-disease-dataset.zip'

# Extract everything next to the archive ...
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(DestinationFolder)

# ... then delete the archive, which is no longer needed.
os.remove(zip_path)

In [9]:
# Rename the extracted dataset folder to 'leaf_images'; the later cells all use that path.
os.rename(DestinationFolder + '/Potato Leaf Disease Dataset in Uncontrolled Environment', DestinationFolder + '/leaf_images')

---

# Data preparation

## Data cleaning

### Remove non-image files

In [10]:
def remove_non_image_file(my_data_dir):
    """Delete non-image files from every class folder of a dataset.

    Each sub-folder of ``my_data_dir`` is scanned. Files whose name does not
    end in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are deleted, and
    a per-folder summary of kept and removed files is printed.

    Args:
        my_data_dir: Directory whose sub-folders contain the image files.

    Returns:
        None.
    """
    image_extension = ('.png', '.jpg', '.jpeg')
    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the class folders (e.g. a README) are not
            # class folders; listing them would raise NotADirectoryError.
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            if given_file.lower().endswith(image_extension):
                image_count += 1
            elif os.path.isfile(file_location):
                os.remove(file_location)  # remove non image file
                non_image_count += 1
            # Nested directories are left untouched: os.remove cannot delete
            # them and they are not files to be counted.

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", non_image_count)


In [None]:
# Clean the class folders of the extracted dataset; a per-folder summary is printed.
remove_non_image_file(my_data_dir='inputs/leaves_dataset/leaf_images')

In [12]:
import shutil

def join_folders(folders, new_folder_name, my_data_dir='inputs/leaves_dataset/leaf_images'):
    """Merge several class folders into one folder and delete the originals.

    Every file of each folder in ``folders`` is copied into
    ``my_data_dir/new_folder_name``; each source folder is removed once its
    files have been copied.

    Args:
        folders: Names of the sub-folders of ``my_data_dir`` to merge.
        new_folder_name: Name of the sub-folder that receives the files. It is
            created if missing and reused if it already exists.
        my_data_dir: Directory containing the folders. Defaults to the
            dataset location used by this notebook.

    Returns:
        None.
    """
    destination = os.path.join(my_data_dir, new_folder_name)
    os.makedirs(destination, exist_ok=True)
    for folder in folders:
        source = os.path.join(my_data_dir, folder)
        for file in os.listdir(source):
            target = os.path.join(destination, file)
            if os.path.exists(target):
                # The same file name exists in an earlier folder: prefix the
                # source folder name so the earlier file is not overwritten
                # (the source folder is deleted right after, so an overwrite
                # would lose data for good).
                target = os.path.join(destination, f"{folder}_{file}")
            shutil.copy(os.path.join(source, file), target)
        shutil.rmtree(source)

In [13]:
# Merge the four listed class folders into a single 'Others' folder
# (join_folders copies their files across and deletes the source folders).
folders = ['Virus', 'Pest', 'Nematode', 'Fungi']
new_folder_name = 'Others'
join_folders(folders, new_folder_name)
# NOTE(review): the merged 'Others' folder is deleted straight away, so the net
# effect of this cell is that these four classes are dropped from the dataset.
# Confirm this is intended.
shutil.rmtree('inputs/leaves_dataset/leaf_images/Others')


## Split train validation test set

In [14]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split each class folder of a dataset into train/validation/test folders.

    Every sub-folder of ``my_data_dir`` is treated as a class label. Its files
    are shuffled with a fixed seed and moved to
    ``my_data_dir/<train|validation|test>/<label>/``; the emptied class folder
    is then removed. Nothing is done when ``my_data_dir`` already contains a
    ``test`` entry, so the function is safe to call again after a split.

    Args:
        my_data_dir: Directory whose sub-folders each hold one class.
        train_set_ratio: Fraction of each class moved to ``train``.
        validation_set_ratio: Fraction of each class moved to ``validation``.
        test_set_ratio: Fraction intended for ``test``. It is only used to
            validate the sum; the test set receives every file left over
            after the train and validation shares.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and no
        file is moved.
    """
    # Local import so the function does not rely on an import in another cell.
    import math

    # Exact float equality rejects valid inputs: 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999, so the sum is compared with a tolerance instead.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # The dataset has already been split.
        return

    # Only directories are class labels; stray files would break os.listdir below.
    labels = [entry for entry in entries
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # create train, validation, test folders with classes labels sub-folder
    for subset in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, subset, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)

        # os.listdir returns names in arbitrary order; sorting first makes the
        # seeded shuffle give the same split on every machine.
        files = sorted(os.listdir(label_dir))
        # A private generator keeps the fixed seed from resetting the global
        # random state used by the rest of the notebook.
        random.Random(42).shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        for index, file_name in enumerate(files):
            if index < train_end:
                subset = 'train'
            elif index < validation_end:
                subset = 'validation'
            else:
                subset = 'test'
            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, subset, label, file_name))

        # Every file has been moved, so the class folder is now empty.
        os.rmdir(label_dir)


We will be opting for a distribution of:
* Training: 0.70 of data.
* Validation: 0.10 of data.
* Test: 0.20 of data.

This is a conventional distribution

In [15]:
# Split every class folder 70 % / 10 % / 20 % into train / validation / test.
split_train_validation_test_images(
    my_data_dir="inputs/leaves_dataset/leaf_images",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

---

**There should now be an 'inputs/leaves_dataset/leaf_images' folder containing a train, test and validation folder**

---

## Create a separate directory to use for the image montage in the dashboard

In [21]:
import shutil
import math
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def montage_images(src_folder, dest_folder, target_count, label_folder):
    """Write resized copies of one class's images into ``dest_folder``.

    A Keras directory iterator restricted to ``label_folder`` is built over
    ``src_folder`` with a 200x200 target size, RGB colour mode and a batch
    size of one. It is configured with ``save_to_dir=dest_folder``, the label
    as file-name prefix and JPEG as format, and is advanced ``target_count``
    times.

    Args:
        src_folder: Directory containing the class sub-folders.
        dest_folder: Directory passed to the iterator as ``save_to_dir``.
        target_count: Number of batches (one image each) to draw.
        label_folder: Name of the class sub-folder to read from.

    Returns:
        None.
    """
    # The listing itself is not used; the call is kept so a missing class
    # folder still raises before the iterator is built.
    os.listdir(f"{src_folder}/{label_folder}")

    resizing_flow = ImageDataGenerator().flow_from_directory(
        src_folder,
        target_size=(200, 200),
        batch_size=1,
        shuffle=True,
        seed=42,
        classes=[label_folder],
        save_to_dir=dest_folder,
        save_prefix=label_folder,
        color_mode='rgb',
        save_format='jpg',
    )

    # The batches are discarded; the iterator is advanced only for the files
    # it is configured to write via save_to_dir.
    drawn = 0
    while drawn < target_count:
        next(resizing_flow)
        drawn += 1

In [None]:
# Root folder for the montage images; one sub-folder per class is created below.
os.makedirs(name='src/montage_images', exist_ok=True)

# The class labels are the sub-folder names of the validation set.
label_folders = os.listdir('inputs/leaves_dataset/leaf_images/validation')
target_count = 20
# Source root is the same for every label, so it is set once before the loop.
label_folder_path = 'inputs/leaves_dataset/leaf_images/validation/'

for label_folder in label_folders:
    dest_folder = f"src/montage_images/{label_folder}"
    os.makedirs(name=dest_folder, exist_ok=True)
    montage_images(
        src_folder=label_folder_path,
        dest_folder=dest_folder,
        target_count=target_count,
        label_folder=label_folder,
    )