In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
from PIL import Image

In [38]:
# Relative locations of the three dataset splits (train / valid / test)
train_dir, valid_dir, test_dir = '../data/train', '../data/valid', '../data/test'

In [41]:
# Root folder that holds the train/valid/test split directories.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
data_folder = '../data'

In [64]:
# Function to collect images and labels from a folder
def collect_images_from_folder(folder_path):
    """Build a DataFrame of image paths and labels from a class-per-subfolder tree.

    Every immediate subdirectory of ``folder_path`` is treated as one label,
    and every entry inside it whose name ends in ``.png``, ``.jpg`` or
    ``.jpeg`` (compared case-insensitively) becomes one row.

    Parameters
    ----------
    folder_path : str
        Directory whose subdirectories are named after the labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_path`` (path with forward slashes) and ``label``.
        Rows are ordered by label name, then by file name. When nothing
        matches, the frame is empty but still has both columns.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder_path`` cannot be listed
        (for example, it does not exist).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    records = []

    # os.listdir returns entries in arbitrary order; sort so the resulting
    # rows are reproducible across machines and runs.
    for label in sorted(os.listdir(folder_path)):
        label_path = os.path.join(folder_path, label)
        if not os.path.isdir(label_path):  # Skip stray files next to the label folders
            continue

        for img_name in sorted(os.listdir(label_path)):
            # Lower-case before comparing so '.JPG' / '.PNG' files are not dropped
            if not img_name.lower().endswith(image_suffixes):
                continue
            # Standardize path slashes so paths look the same on every OS
            img_path = os.path.join(label_path, img_name).replace('\\', '/')
            records.append((img_path, label))

    return pd.DataFrame(records, columns=['image_path', 'label'])

In [65]:
# Scan each split directory once, in train -> valid -> test order
train_df, valid_df, test_df = (
    collect_images_from_folder(split_dir)
    for split_dir in (train_dir, valid_dir, test_dir)
)

In [67]:
# Stack train, valid and test row-wise into one frame with a fresh 0..n-1 index
all_data_df = pd.concat(
    objs=[train_df, valid_df, test_df],
    axis=0,
    ignore_index=True,
)

In [77]:
# Report the shape of every split and of the combined frame, one per line
print(
    *(
        f"{heading} {frame.shape}"
        for heading, frame in (
            ("Training set shape:", train_df),
            ("Validation set shape:", valid_df),
            ("Test set shape:", test_df),
            ("Total dataset shape:", all_data_df),
        )
    ),
    sep="\n",
)

Training set shape: (1300, 2)
Validation set shape: (260, 2)
Test set shape: (26, 2)
Total dataset shape: (1586, 2)


In [78]:
# Preview the first rows of the train, valid and combined frames
print(
    *(
        f"{heading} {frame.head()}"
        for heading, frame in (
            ("Training DataFrame sample:\n", train_df),
            ("Validation DataFrame sample:\n", valid_df),
            ("All Data DataFrame sample:\n", all_data_df),
        )
    ),
    sep="\n",
)

Training DataFrame sample:
                                        image_path   label
0        ../data/train/Aphids/00_aphids_train.jpg  Aphids
1   ../data/train/Aphids/00_aphids_train_aug0.jpg  Aphids
2  ../data/train/Aphids/00_aphids_train_aug10.jpg  Aphids
3  ../data/train/Aphids/00_aphids_train_aug20.jpg  Aphids
4  ../data/train/Aphids/00_aphids_train_aug30.jpg  Aphids
Validation DataFrame sample:
                                        image_path   label
0        ../data/valid/Aphids/00_aphids_valid.jpg  Aphids
1   ../data/valid/Aphids/00_aphids_valid_aug0.jpg  Aphids
2  ../data/valid/Aphids/00_aphids_valid_aug10.jpg  Aphids
3  ../data/valid/Aphids/00_aphids_valid_aug12.jpg  Aphids
4  ../data/valid/Aphids/00_aphids_valid_aug14.jpg  Aphids
All Data DataFrame sample:
                                        image_path   label
0        ../data/train/Aphids/00_aphids_train.jpg  Aphids
1   ../data/train/Aphids/00_aphids_train_aug0.jpg  Aphids
2  ../data/train/Aphids/00_aphids_train_aug1

In [63]:
# Number of images per label across the combined train + valid + test frame
class_distribution = all_data_df.loc[:, 'label'].value_counts()
print(class_distribution)

label
Aphids                   122
Botrytis                 122
Dehydration              122
Healthy                  122
Leaf Miners              122
Nitrogen Deficiency      122
Nutrient Burn            122
Overwatering             122
PH Fluctuation           122
Phosphorus Deficiency    122
Potassium Deficiency     122
Powdery Mildew           122
Septoria                 122
Name: count, dtype: int64


In [68]:
# Images per label in the training split
train_df.loc[:, 'label'].value_counts()

label
Aphids                   100
Botrytis                 100
Dehydration              100
Healthy                  100
Leaf Miners              100
Nitrogen Deficiency      100
Nutrient Burn            100
Overwatering             100
PH Fluctuation           100
Phosphorus Deficiency    100
Potassium Deficiency     100
Powdery Mildew           100
Septoria                 100
Name: count, dtype: int64

In [69]:
# Images per label in the validation split
valid_df.loc[:, 'label'].value_counts()

label
Aphids                   20
Botrytis                 20
Dehydration              20
Healthy                  20
Leaf Miners              20
Nitrogen Deficiency      20
Nutrient Burn            20
Overwatering             20
PH Fluctuation           20
Phosphorus Deficiency    20
Potassium Deficiency     20
Powdery Mildew           20
Septoria                 20
Name: count, dtype: int64

In [None]:
# TODO: unfinished draft of a helper to display sample images for a label (not runnable as written)
# def display_sample_images(dataframe, label, n=13)
#     sample_images = train_df[train_df['label']]

In [88]:
# Display the first images from the train set
def plot_images_from_df(train_df, title, n_cols=5):
    """Show the first image of every label in a grid of subplots.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with an ``image_path`` column (file to open) and a ``label``
        column (used as each subplot's title).
    title : str
        Title placed above the whole figure.
    n_cols : int, default 5
        Number of subplot columns; as many rows as needed are created.
        With the default and 11-15 labels this gives the 3 x 5 grid at
        20 x 15 inches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Keep only the first row encountered for each label: one picture per class
    first_per_label = train_df.drop_duplicates(subset='label', keep='first')
    num_images = len(first_per_label)

    # Ceiling division; keep at least one row so an empty frame still yields a valid grid
    n_rows = max(1, -(-num_images // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, 5 * n_rows), squeeze=False
    )
    fig.suptitle(title, fontsize=16)

    # squeeze=False always gives a 2-D array of Axes; flatten it so that one
    # position corresponds to exactly one subplot.
    flat_axes = axes.ravel()
    for ax in flat_axes:
        ax.axis('off')  # Also blanks the trailing cells that receive no image

    for ax, img_path, label in zip(
        flat_axes, first_per_label['image_path'], first_per_label['label']
    ):
        # Open this row's file (not the whole column) and release the handle afterwards
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(label)

    plt.show()

In [None]:
# Plot images from the training set (only train_df is passed; the validation set is not plotted here)
plot_images_from_df(train_df, "First Image from Each Label in the Training Set")

In [None]:
# Display 5 random samples from the "Healthy" class
# display_sample_images(train_df, 'Healthy', n=5)

In [None]:
# Display 5 random samples from the "Potassium Deficiency" class
# display_sample_images(train_df, 'Potassium Deficiency', n=5)

In [None]:
# Visualize class distribution
# train_df['label'].value_counts().plot(kind='bar', title='Class Distribution in Training Set')
# plt.show()

In [19]:
# Training set class distribution
# train_class_distribution = train_df['label'].value_counts()

In [None]:
# Pie chart for training set class distribution
# plt.figure(figsize=(8, 8))
# plt.pie(train_class_distribution, labels=train_class_distribution.index, autopct='%1.1f%%', startangle=90, colors=plt.cm.Paired.colors)
# plt.title('Class Distribution (Training Set)')
# plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# plt.show()