## Data Collection

- Import
- Set working directory
- Get data from Kaggle

# Import packages

In [20]:
! pip install -r /workspace/PP5/requirements.txt



In [21]:
import numpy
import os

# Change working directory

In [22]:
current_dir = os.getcwd()
current_dir

'/workspace/PP5'

In [23]:
os.chdir('/workspace/PP5')
print("You set a new current directory")

You set a new current directory


In [24]:
current_dir = os.getcwd()
current_dir

'/workspace/PP5'

# Install Kaggle

In [25]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


Run the cell below **to change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON**.

In [26]:
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

chmod: cannot access 'kaggle.json': No such file or directory


Set the Kaggle Dataset and Download it.

In [27]:
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherryleaves_dataset"   
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Dataset URL: https://www.kaggle.com/datasets/codeinstitute/cherry-leaves
License(s): unknown
Downloading cherry-leaves.zip to inputs/cherryleaves_dataset
 96%|████████████████████████████████████▌ | 53.0M/55.0M [00:01<00:00, 41.0MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:01<00:00, 29.0MB/s]


Unzip the downloaded file, and delete the zip file.

In [28]:
import zipfile
with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

os.remove(DestinationFolder + '/cherry-leaves.zip')

---

# Data Preparation

---

## Data cleaning

### Check and remove non-image files

In [38]:
import os

def remove_non_image_file(my_data_dir):
    """
    Remove non-image files from the given dataset directory.
    """
    image_extension = ('.png', '.jpg', '.jpeg')
    folders = os.listdir(my_data_dir)
    
    for folder in folders:
        folder_path = os.path.join(my_data_dir, folder)
        
        # Ensure it's a directory
        if os.path.isdir(folder_path):
            files = os.listdir(folder_path)
            
            i = []  # Track non-image files removed
            j = []  # Track image files

            for given_file in files:
                file_location = os.path.join(folder_path, given_file)
                
                # Check if it's a file and not a directory
                if os.path.isfile(file_location):
                    if not given_file.lower().endswith(image_extension):
                        os.remove(file_location)  # Remove non-image file
                        i.append(1)
                        print(f"Removed non-image file: {file_location}")
                    else:
                        j.append(1)
                elif os.path.isdir(file_location):
                    print(f"Skipping subdirectory: {file_location}")
            
            print(f"Folder: {folder} - has image files: {len(j)}")
            print(f"Folder: {folder} - has non-image files removed: {len(i)}")


In [42]:
remove_non_image_file(my_data_dir='inputs/cherryleaves_dataset/cherry-leaves')

Folder: healthy - has image files: 2104
Folder: healthy - has non-image files removed: 0
Folder: powdery_mildew - has image files: 2104
Folder: powdery_mildew - has non-image files removed: 0
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/test/healthy
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/test/powdery_mildew
Folder: test - has image files: 0
Folder: test - has non-image files removed: 0
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/train/healthy
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/train/powdery_mildew
Folder: train - has image files: 0
Folder: train - has non-image files removed: 0
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/validation/healthy
Skipping subdirectory: inputs/cherryleaves_dataset/cherry-leaves/validation/powdery_mildew
Folder: validation - has image files: 0
Folder: validation - has non-image files removed: 0


## Split train validation test set

In [45]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):

    if train_set_ratio + validation_set_ratio + test_set_ratio != 1.0:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # gets classes labels
    labels = os.listdir(my_data_dir)  # it should get only the folder name
    if 'test' in labels:
        pass
    else:
        # create train, test folders with classes labels sub-folder
        for folder in ['train', 'validation', 'test']:
            for label in labels:
                os.makedirs(name=my_data_dir + '/' + folder + '/' + label)

        for label in labels:

            files = os.listdir(my_data_dir + '/' + label)
            random.shuffle(files)

            train_set_files_qty = int(len(files) * train_set_ratio)
            validation_set_files_qty = int(len(files) * validation_set_ratio)

            count = 1
            for file_name in files:
                if count <= train_set_files_qty:
                    # move a given file to the train set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/train/' + label + '/' + file_name)

                elif count <= (train_set_files_qty + validation_set_files_qty):
                    # move a given file to the validation set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/validation/' + label + '/' + file_name)

                else:
                    # move given file to test set
                    shutil.move(my_data_dir + '/' + label + '/' + file_name,
                                my_data_dir + '/test/' + label + '/' + file_name)

                count += 1

            os.rmdir(my_data_dir + '/' + label)

 - The training set is divided into a 0.70 ratio of data.
 - The validation set is divided into a 0.10 ratio of data.
 - The test set is divided into a 0.20 ratio of data

In [46]:
split_train_validation_test_images(my_data_dir=f"inputs/cherryleaves_dataset/cherry-leaves",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )

## Resize images

Resizing images in datasets to 100px x 100px for further process.

In [34]:
# Parts of code from Walkthrough project 1 - Code Institute 
# Parts of code from https://stackoverflow.com/questions/48121916/numpy-resize-rescale-image

import cv2

data_dir = 'inputs/cherryleaves_dataset/cherry-leaves'

def image_resize(data_dir, new_size=(100, 100)):
    total_files_processed = 0
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                file_path = os.path.join(root, file)
                
                img = cv2.imread(file_path)
                img_resized = cv2.resize(img, dsize=new_size, interpolation=cv2.INTER_CUBIC)
                
                cv2.imwrite(file_path, img_resized)
                
                total_files_processed += 1

    print(f"Processed {total_files_processed} files in {data_dir}")
    print(f"All images resized to {new_size[0]}px x {new_size[1]}px")

# Resize images in the train, validation, and test directories
print("Processing train dataset...")
image_resize(os.path.join(data_dir, 'train'))

print("Processing validation dataset...")
image_resize(os.path.join(data_dir, 'validation'))

print("Processing test dataset...")
image_resize(os.path.join(data_dir, 'test'))

Processing train dataset...
Processed 0 files in inputs/cherryleaves_dataset/cherry-leaves/train
All images resized to 100px x 100px
Processing validation dataset...
Processed 0 files in inputs/cherryleaves_dataset/cherry-leaves/validation
All images resized to 100px x 100px
Processing test dataset...
Processed 0 files in inputs/cherryleaves_dataset/cherry-leaves/test
All images resized to 100px x 100px
