# **(Data Collection)**

## Objectives

* Fetch data from Kaggle and prepare it for further processing.

## Inputs

* Kaggle JSON file - authentication token

## Outputs

* Generate Dataset: inputs/cherry_mildew_dataset/cherry-leaves (split into train, validation and test folders)

## Additional Comments

* No comment



---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import numpy
import os
# Store the directory the notebook is currently running from.
# The bare name on the last line makes the notebook display the value as the cell output.
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the directory stored in current_dir the working directory.
# NOTE(review): current_dir is reassigned by the confirmation cell further down, so running
# this cell again after that cell moves up one more level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Read the working directory again and display it, to confirm the change took effect.
current_dir = os.getcwd()
current_dir

# Install Kaggle

Install the Kaggle package, which is used to download the dataset.

In [None]:
# Install the Kaggle package with pip (the leading "!" runs the line as a shell command)
!pip install kaggle

---

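# Kaggle authentication

Point Kaggle at the `kaggle.json` authentication token in the working directory and restrict the file's permissions.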

In [None]:
# Set KAGGLE_CONFIG_DIR to the current working directory
# (presumably where the Kaggle client looks for kaggle.json - confirm against the Kaggle docs)
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict kaggle.json to read/write for its owner only; the file must be in the working directory
! chmod 600 kaggle.json

Set the Kaggle dataset path and the destination folder, then download the dataset.

In [None]:
# Dataset identifier passed to the Kaggle CLI
# NOTE(review): the path ends with a trailing "/" - confirm the Kaggle CLI accepts it
KaggelDatasetPath = "codeinstitute/cherry-leaves/"
# Folder, relative to the working directory, that receives the download
DestinationFolder = "inputs/cherry_mildew_dataset"
# Download the dataset; the {…} placeholders are filled in from the Python variables above
! kaggle datasets download -d {KaggelDatasetPath} -p {DestinationFolder}

Unzip the downloaded file, then delete the zip file.

In [None]:
# Extract every zip archive in the destination folder into that same folder,
# and delete the archives only if the extraction succeeded (&&)
! unzip {DestinationFolder}/*.zip -d {DestinationFolder} \
 && rm {DestinationFolder}/*.zip

<hr>

# Data Preparation

---

## Data Cleaning

In [None]:
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the class folders of a dataset.

    Each sub-folder of ``my_data_dir`` is scanned one level deep (not
    recursively). Entries whose name does not end in ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive) are deleted from disk, and a two-line
    summary with the number of image and non-image files is printed per
    folder.

    Entries of ``my_data_dir`` that are not folders (for example a stray
    README or archive) are skipped and left untouched, instead of making
    the scan fail.

    Args:
        my_data_dir: Path of the directory holding one sub-folder per class.
    """
    image_extension = ('.png', '.jpg', '.jpeg')
    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A loose file next to the class folders cannot be listed.
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            # str.endswith accepts a tuple, so one call covers all extensions.
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(os.path.join(folder_path, given_file))
                non_image_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", non_image_count)

In [None]:
# Strip non-image files from every class folder of the downloaded dataset
remove_non_image_file("inputs/cherry_mildew_dataset/cherry-leaves")

## Split train validation test set

In [None]:
# This code snippet was adapted from the Code Institute Malaria Detector Walkthrough Sample Project
# https://github.com/Code-Institute-Solutions/WalkthroughProject01/blob/main/jupyter_notebooks/01%20-%20DataCollection.ipynb

import os
import shutil
import random
import joblib

def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split each class folder into train, validation and test sub-sets.

    Every entry of ``my_data_dir`` is treated as a class label. For each
    label the files are shuffled and moved to
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>``
    and ``<my_data_dir>/test/<label>``; the emptied label folder is then
    removed. The train and validation sizes are the file count multiplied
    by the matching ratio, truncated to an integer; all remaining files go
    to the test set.

    If the three ratios do not sum to 1.0 a message is printed and nothing
    is moved. If ``my_data_dir`` already contains an entry named ``test``
    the function does nothing.

    Args:
        my_data_dir: Directory holding one sub-folder per class label.
        train_set_ratio: Fraction of each class moved to the train set.
        validation_set_ratio: Fraction of each class moved to the
            validation set.
        test_set_ratio: Fraction of each class intended for the test set;
            only used to validate that the ratios sum to 1.0.
    """
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    # Compare with a tolerance instead of "!=": in binary floating point a
    # valid split such as 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    if abs(ratio_sum - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Gets class labels (the folder names)
    labels = os.listdir(my_data_dir)

    if 'test' in labels:
        # An existing 'test' entry is taken to mean the split was already done.
        return

    # Create train, validation, and test folders with class label sub-folders
    for folder in ['train', 'validation', 'test']:
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, folder, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        for position, file_name in enumerate(files):
            if position < train_end:
                target_set = 'train'
            elif position < validation_end:
                target_set = 'validation'
            else:
                target_set = 'test'

            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, target_set, label, file_name))

        # Every file has been moved out, so the label folder is empty now.
        os.rmdir(label_dir)


In [None]:
# Split the cleaned dataset: 70% train, 10% validation, 20% test
split_train_validation_test_images(
    my_data_dir="inputs/cherry_mildew_dataset/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

# Push files to Repo

* If you don't need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.