In [None]:
import os
import cv2
import numpy as np
import hashlib
from sklearn.utils import shuffle

In [None]:
def load_dataset(directory):
    """Load every ``.jpg`` image found in the class sub-folders of ``directory``.

    Each immediate sub-directory of ``directory`` is treated as one class. The
    class index of an image is the position of its folder in the alphabetically
    sorted list of class folders.

    Args:
        directory: Path of the dataset root (one sub-folder per class).

    Returns:
        A ``(images, labels, class_names)`` tuple: ``images`` is an
        object-dtype NumPy array holding the images converted from BGR to RGB,
        ``labels`` is a NumPy array with the class index of each image, and
        ``class_names`` is the sorted list of class folder names.
    """
    images = []
    labels = []

    # Only sub-directories are classes. A stray file in the dataset root must
    # neither become a class nor make os.listdir(class_dir) fail.
    class_names = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )

    # Iterate through each class
    for i, nama_kelas in enumerate(class_names):
        class_dir = os.path.join(directory, nama_kelas)

        # Sorted so the sample order does not depend on the file system.
        for file_name in sorted(os.listdir(class_dir)):
            if not file_name.lower().endswith('.jpg'):
                continue

            image_path = os.path.join(class_dir, file_name)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports an unreadable file by returning None
                # instead of raising; skip it rather than crash in cvtColor.
                print(f"Warning: could not read image, skipping: {image_path}")
                continue

            images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            labels.append(i)

    return np.array(images, dtype=object), np.array(labels), class_names

def count_samples_per_class(labels, class_names):
    """Count how many labels belong to each class.

    Args:
        labels: Array or sequence of integer class indices.
        class_names: List of class names; position ``k`` names class index ``k``.

    Returns:
        A list with one count per entry of ``class_names``, in the same order.
    """
    # np.asarray keeps the comparison element-wise for plain lists too; a list
    # compared with an int is a single ``False`` and would count as 0.
    label_array = np.asarray(labels)
    return [
        np.sum(label_array == class_index)
        for class_index in range(len(class_names))
    ]

def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` degrees in steps of 90 degrees.

    The rotation is delegated to ``np.rot90``, so a positive angle rotates in
    ``np.rot90``'s default direction and a negative angle in the opposite one.

    Args:
        image: Array to rotate (rotated in the plane of its first two axes).
        angle: Rotation in degrees; must be a multiple of 90 (e.g. 90, 180,
            270, -90).

    Returns:
        The rotated array as returned by ``np.rot90``.

    Raises:
        ValueError: If ``angle`` is not a multiple of 90 degrees.
    """
    try:
        quarter_turns, remainder = divmod(angle, 90)
    except TypeError:
        # Non-numeric angles are rejected the same way as unsupported ones.
        raise ValueError("Only rotations by multiples of 90 degrees are supported.") from None

    # NaN/inf produce a NaN remainder, which also fails this comparison.
    if remainder != 0:
        raise ValueError("Only rotations by multiples of 90 degrees are supported.")

    # Four quarter turns are a full circle, so only the remainder matters.
    return np.rot90(image, k=int(quarter_turns) % 4)

def image_exists(image, images):
    """Tell whether ``images`` contains an array equal to ``image``.

    Equality is decided by ``np.array_equal``. The scan stops at the first
    match.

    Args:
        image: Array to look for.
        images: Iterable of arrays to search.

    Returns:
        ``True`` if an equal array is found, ``False`` otherwise.
    """
    return any(np.array_equal(image, candidate) for candidate in images)

def _as_object_vector(items):
    """Pack ``items`` into a 1-D object array with one element per item."""
    vector = np.empty(len(items), dtype=object)
    # Element-wise assignment avoids NumPy trying to broadcast the items into
    # one dense block, which fails when their shapes differ.
    for position, item in enumerate(items):
        vector[position] = item
    return vector


def _append_images(images, new_images):
    """Return ``images`` followed by ``new_images`` along the first axis."""
    if isinstance(images, np.ndarray) and images.ndim > 1:
        # Dense (N, ...) input stays dense only when every new image has the
        # same per-sample shape.
        if all(np.shape(img) == images.shape[1:] for img in new_images):
            return np.concatenate((images, np.stack(new_images)), axis=0)
    # Otherwise fall back to a 1-D object array of individual images.
    return np.concatenate(
        (_as_object_vector(images), _as_object_vector(new_images)), axis=0
    )


def random_oversampling(images, labels, minority_class, oversampling_ratio):
    """Augment the minority class with 90-degree rotated copies of its images.

    Minority samples are drawn at random with replacement. Each draw is
    rotated with ``rotate_image`` and kept only when the rotated image is not
    already present in ``images``. Because a sample can be drawn more than
    once, the generated images may repeat each other.

    Args:
        images: Array of images, indexable by sample position.
        labels: Array of class indices aligned with ``images``.
        minority_class: Class index whose samples are oversampled.
        oversampling_ratio: Number of images to generate, expressed as a
            multiple of the current minority-class sample count.

    Returns:
        ``(images, labels)``: the originals plus the generated samples,
        shuffled together.

    Raises:
        ValueError: If images must be generated but the rotation of every
            minority sample is already present in ``images``.
    """
    labels = np.asarray(labels)

    # Determine the indices of samples from the minority class
    minority_indices = np.where(labels == minority_class)[0]
    minority_count = len(minority_indices)
    target_count = int(oversampling_ratio * minority_count)

    oversampled_images = []
    # Whether a sample's rotation is new is fixed for the whole call, so the
    # outcome is remembered per index instead of rescanning the dataset.
    accepted = {}     # source index -> rotated image that passed the check
    rejected = set()  # source indices whose rotation already exists

    while len(oversampled_images) < target_count:
        if len(rejected) == minority_count:
            # No draw can ever succeed; fail instead of looping forever.
            raise ValueError(
                "Cannot oversample: the rotated version of every minority "
                "sample already exists in the dataset."
            )

        random_index = int(np.random.choice(minority_indices))
        if random_index in rejected:
            continue

        if random_index not in accepted:
            # Rotate the image by 90 degrees
            rotated_image = rotate_image(images[random_index], 90)
            if image_exists(rotated_image, images):
                rejected.add(random_index)
                continue
            accepted[random_index] = rotated_image

        oversampled_images.append(accepted[random_index])

    # Combine the original dataset with the oversampled results. Skipped when
    # nothing was generated so the inputs keep their shape and dtype.
    if oversampled_images:
        images = _append_images(images, oversampled_images)
        new_labels = np.full(len(oversampled_images), minority_class, dtype=labels.dtype)
        labels = np.concatenate((labels, new_labels), axis=0)

    # Shuffle the dataset
    images, labels = shuffle(images, labels)

    return images, labels

# Path to the directory containing the images
directory = '/content/drive/MyDrive/Capstone/data mentah'
images, labels, class_names = load_dataset(directory)

# Display dataset information
print("Number of classes:", len(class_names))
print("Total number of images:", len(images))

# Seed NumPy's global generator so the random draws made during oversampling
# are the same on every run of this cell.
SEED = 42
np.random.seed(SEED)

# Apply random oversampling to the minority class
# (class_names.index raises ValueError if the class folder is missing)
minority_class = 'Gray_Leaf_Spot'
minority_class_index = class_names.index(minority_class)
oversampled_images, oversampled_labels = random_oversampling(images, labels, minority_class_index, oversampling_ratio=1.0)

# Display information after oversampling
print("Total number of images after oversampling:", len(oversampled_images))

In [None]:
# Count samples per class before and after oversampling
original_sample_counts = count_samples_per_class(labels, class_names)
oversampled_sample_counts = count_samples_per_class(oversampled_labels, class_names)

print("\nNumber of samples per class before oversampling:")
for nama_kelas, sample_count in zip(class_names, original_sample_counts):
    print(f"{nama_kelas}: {sample_count}")

print("\nNumber of samples per class after oversampling:")
for nama_kelas, sample_count in zip(class_names, oversampled_sample_counts):
    print(f"{nama_kelas}: {sample_count}")

# Save the oversampled data to the output directory, one sub-folder per class
output_dir = '/content/drive/MyDrive/Capstone/data_oversampling'
os.makedirs(output_dir, exist_ok=True)

created_dirs = set()  # class folders already created in this run
failed_paths = []     # files cv2.imwrite reported it could not write

for i in range(len(oversampled_images)):
    nama_kelas = class_names[int(oversampled_labels[i])]
    image_name = f"{nama_kelas}_{i}.jpg"
    class_dir = os.path.join(output_dir, nama_kelas)
    image_path = os.path.join(class_dir, image_name)

    # Create each class folder once instead of once per image
    if class_dir not in created_dirs:
        os.makedirs(class_dir, exist_ok=True)
        created_dirs.add(class_dir)

    # cv2 needs a numeric array; rotated images may be views or object-dtype.
    # NOTE(review): assumes 8-bit images, as loaded by cv2.imread — confirm.
    image_rgb = np.ascontiguousarray(oversampled_images[i], dtype=np.uint8)

    # cv2.imwrite signals failure through its return value, not an exception
    if not cv2.imwrite(image_path, cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)):
        failed_paths.append(image_path)

if failed_paths:
    print(f"\nWarning: {len(failed_paths)} image(s) could not be written, e.g. {failed_paths[0]}")

print("\nOversampled data has been saved to the directory:", output_dir)

In [None]:
from PIL import Image

def compute_image_hash(image_path):
    """Return an MD5 hex digest of the image's pixel content.

    The image is resized to 256x256 and converted to 8-bit grayscale
    (PIL mode ``'L'``) before hashing, so the digest is computed from the
    normalised pixels rather than from the bytes of the file.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The hexadecimal MD5 digest string.
    """
    with Image.open(image_path) as source:
        resized = source.resize((256, 256))
        grayscale = resized.convert('L')
        pixel_bytes = grayscale.tobytes()
        digest = hashlib.md5(pixel_bytes)
        return digest.hexdigest()

def find_duplicates(directory):
    """Find ``.jpg`` files under ``directory`` whose image hash repeats.

    The tree is walked recursively in sorted order. The first file seen with a
    given hash is kept as the reference, and every later file with the same
    hash is reported and collected as its duplicate.

    Args:
        directory: Root directory to scan.

    Returns:
        A list of ``(duplicate_path, first_seen_path)`` tuples.
    """
    file_hashes = {}
    duplicates = []

    for root, dirs, files in os.walk(directory):
        # Sorting in place fixes the order in which os.walk descends, so the
        # same file of a pair is flagged as the duplicate on every run.
        dirs.sort()

        for file in sorted(files):
            if not file.lower().endswith('.jpg'):
                continue

            file_path = os.path.join(root, file)
            try:
                file_hash = compute_image_hash(file_path)
            except (OSError, ValueError) as e:
                # One unreadable file should not abort the whole scan.
                print(f"Error hashing file {file_path}: {e}")
                continue

            if file_hash in file_hashes:
                duplicates.append((file_path, file_hashes[file_hash]))
                print(f"Duplicate found: {file_path} is a duplicate of {file_hashes[file_hash]}")
            else:
                file_hashes[file_hash] = file_path

    if not duplicates:
        print("No duplicates found.")
    else:
        print(f"Total {len(duplicates)} duplicates found.")

    return duplicates

# Scan the oversampled dataset for images with identical hashes
directory = "/content/drive/MyDrive/Capstone/data_oversampling"
duplicates = find_duplicates(directory)

# Write one line per (duplicate, first-seen file) pair to a text report
with open("duplicates.txt", "w") as report_file:
    report_file.writelines(
        f"{pair[0]} is a duplicate of {pair[1]}\n" for pair in duplicates
    )
    print("List of duplicates has been saved to 'duplicates.txt'.")

In [None]:
def remove_duplicates(duplicates):
    """Delete the file named by the first element of every entry.

    A failure on one file is printed and does not stop the remaining
    deletions.

    Args:
        duplicates: Iterable of sequences whose first element is the path of
            the file to delete.
    """
    for target in (entry[0] for entry in duplicates):
        try:
            os.remove(target)
            print(f"Removed duplicate file: {target}")
        except Exception as err:
            print(f"Error removing file {target}: {err}")

# Delete from disk the first path of every entry in `duplicates`
# (os.remove is permanent; failures are printed by remove_duplicates).
remove_duplicates(duplicates)

In [None]:
def count_files(directory_path):
    """Return the number of regular files directly inside ``directory_path``.

    Sub-directories are not counted and not descended into.
    """
    return sum(
        os.path.isfile(os.path.join(directory_path, name))
        for name in os.listdir(directory_path)
    )


# Class folders of the oversampled dataset within Google Drive
OVERSAMPLED_DIR = '/content/drive/MyDrive/Capstone/data_oversampling'
CLASS_FOLDERS = ('Blight', 'Common_Rust', 'Gray_Leaf_Spot', 'Healthy')

# Report the file count of every class folder
for class_folder in CLASS_FOLDERS:
    directory_path = f'{OVERSAMPLED_DIR}/{class_folder}'
    file_count = count_files(directory_path)

    print(f'Total number of files in "{directory_path}": {file_count}')