# Import Dependencies

In [None]:
pip install ultralytics

import torch
import torchvision
import torchvision.transforms as transforms
import zipfile
import os
import hashlib
import pandas as pd
from PIL import Image
import numpy as np
import yaml
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO

In [None]:
# CIFAR-10 sanity check: load a small batch and display it with its class names.
# Normalize with mean 0.5 / std 0.5 per channel computes (x - 0.5) / 0.5.
cifar_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=cifar_transform,
)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True)

# Take the first (shuffled) batch of four images.
images, labels = next(iter(trainloader))

# make_grid returns channels-first data; imshow wants channels last.
image_grid = torchvision.utils.make_grid(images).permute(1, 2, 0)
# x / 2 + 0.5 is the inverse of the normalisation above, restoring displayable values.
plt.imshow(image_grid / 2 + 0.5)
plt.title(' '.join(trainset.classes[label] for label in labels))
plt.show()

# Clean Data

### Import dataset

In [None]:
# Download the DeepFruits archive; -O stores it as "DeepFruits_Dataset.zip" in the current working directory.
# NOTE(review): the extraction cell reads /content/DeepFruits_Dataset.zip, so this presumably relies on the working directory being /content — confirm.
!wget -O "DeepFruits_Dataset.zip" "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/5prc54r4rt-1.zip"

### Extract the archive into the working directory

In [None]:
# Unpack the downloaded archive; the context manager closes it once extraction is done.
archive_path = "/content/DeepFruits_Dataset.zip"
extract_dir = '/content/deepfruits'

with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

print("Dataset has been extracted to '/content/deepfruits'")


In [None]:
dataset_path = '/content/deepfruits'

# Print an indented tree of the extracted dataset: two spaces per directory level.
for root, dirs, files in os.walk(dataset_path):
    depth = root.replace(dataset_path, '').count(os.sep)
    dir_indent = '  ' * depth
    print(f"{dir_indent}{os.path.basename(root)}/")

    file_indent = '  ' * (depth + 1)
    # Only the first 5 files of each directory are listed, for brevity.
    for name in files[:5]:
        print(f"{file_indent}{name}")


In [None]:
dataset_path = '/content/deepfruits/DeepFruits Dataset of Fruits Images with different combinations for Fruit Recognition and Calories Estimation/Fruits_Dataset_Test'

# Maps a (numeric) folder name to the fruit label stored for its images.
label_map = {
    "1": "apple",
    "2": "banana",
    "3": "mango",
    "4": "orange",
    "5": "grapes",
    "6": "guava",
    "7": "lemon",
    "8": "pomegranate"
}

image_extensions = ('.jpg', '.jpeg', '.png')

# Pair every walked directory with its stripped folder name once.
walked_dirs = (
    (root, files, os.path.basename(root).strip())
    for root, _dirs, files in os.walk(dataset_path)
)

# One record per image file that sits in a folder listed in label_map;
# folders that aren't in the label map are skipped.
data = [
    {'image_path': os.path.join(root, file), 'label': label_map[folder_name]}
    for root, files, folder_name in walked_dirs
    if folder_name in label_map
    for file in files
    if file.lower().endswith(image_extensions)
]

df = pd.DataFrame(data)
print(df.head())


### Load the test data

In [None]:
dataset_path = '/content/deepfruits/DeepFruits Dataset of Fruits Images with different combinations for Fruit Recognition and Calories Estimation/Fruits_Dataset_Test'

# Load every test image into memory as an RGB array, labelled with the name
# of the folder that contains it.
data = []

for root, dirs, files in os.walk(dataset_path):
    folder_name = os.path.basename(root)

    # Skip the dataset root itself: only files inside sub-folders are loaded.
    if folder_name == os.path.basename(dataset_path):
        continue

    for file in files:
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        try:
            # The context manager closes the underlying file even when
            # decoding fails, instead of leaving that to garbage collection.
            with Image.open(image_path) as img:
                img_array = np.array(img.convert('RGB'))
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Failed to load image: {image_path} — {e}")
            continue

        # Append image data, label, and filename
        data.append({
            'image': img_array,
            'label': folder_name,
            'filename': file
        })

# Create DataFrame
df_test = pd.DataFrame(data)

# Preview it
print(df_test.head())
print(f"\nLoaded {len(df_test)} images.")


In [None]:
# Show the first rows of the loaded test images as the cell output.
df_test.head()

In [None]:
# Inspect the pixel array stored under index label 0 of the 'image' column.
df_test['image'][0]

### Load Train data

In [None]:
dataset_path = '/content/deepfruits/DeepFruits Dataset of Fruits Images with different combinations for Fruit Recognition and Calories Estimation/Fruits_Dataset_Train'

# Load every training image into memory as an RGB array, labelled with the
# name of the folder that contains it.
data = []

for root, dirs, files in os.walk(dataset_path):
    folder_name = os.path.basename(root)

    # Skip the dataset root itself: only files inside sub-folders are loaded.
    if folder_name == os.path.basename(dataset_path):
        continue

    for file in files:
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(root, file)
        try:
            # The context manager closes the underlying file even when
            # decoding fails, instead of leaving that to garbage collection.
            with Image.open(image_path) as img:
                img_array = np.array(img.convert('RGB'))
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"Failed to load image: {image_path} — {e}")
            continue

        # Add image, label (folder name), and filename
        data.append({
            'image': img_array,
            'label': folder_name,
            'filename': file
        })

df_train = pd.DataFrame(data)

# Show a preview
print(df_train.head())
print(f"\nLoaded {len(df_train)} images.")


In [None]:
# Inspect the pixel array stored under index label 0 of the 'image' column.
df_train['image'][0]

### Import the Test labels and put in pandas df

In [None]:
test_labels_csv = '/content/deepfruits/DeepFruits Dataset of Fruits Images with different combinations for Fruit Recognition and Calories Estimation/Labels_Test.csv'

# Read the test label table and rename 'FileName' to 'filename' in one chained expression.
Labels_Test_df = pd.read_csv(test_labels_csv).rename(columns={'FileName': 'filename'})

# Leave the DataFrame as the last expression so the notebook renders it.
Labels_Test_df

### Merge test dataframes on filename

In [None]:
# Join the loaded test images with their label rows on the shared 'filename' column.
merged_test_df = df_test.merge(Labels_Test_df, on='filename')

# Sanity checks: first rows, resulting shape, and missing values per column.
merge_checks = (
    merged_test_df.head(),
    merged_test_df.shape,
    merged_test_df.isnull().sum(),
)
for check in merge_checks:
    print(check)

### import train labels and merge on filename

In [None]:
train_labels_csv = '/content/deepfruits/DeepFruits Dataset of Fruits Images with different combinations for Fruit Recognition and Calories Estimation/Labels_Train.csv'

# Read the training label table and rename 'FileName' to 'filename'.
Labels_Train_df = pd.read_csv(train_labels_csv).rename(columns={'FileName': 'filename'})

# Join the loaded training images with their label rows on 'filename'.
merged_train_df = df_train.merge(Labels_Train_df, on='filename')

# Sanity checks: first rows, resulting shape, and missing values per column.
merge_checks = (
    merged_train_df.head(),
    merged_train_df.shape,
    merged_train_df.isnull().sum(),
)
for check in merge_checks:
    print(check)

# EDA

In [None]:
# Label columns holding the per-image count of each fruit.
fruit_columns = [
    "Mango", "Grape", "Plum", "Kiwi", "Pear",
    "Apple", "Orange", "Banana", "Pomegranate", "Strawberry",
    "Pineapple", "Fig", "Peach", "Apricot", "Avocado",
    "Summer Squash", "Lemon", "Lime", "Guava", "Raspberry",
]

# Column-wise totals over all training rows, largest first.
per_fruit_counts = merged_train_df[fruit_columns].sum()
fruit_totals = per_fruit_counts.sort_values(ascending=False)

# Bar chart of the totals.
plt.figure(figsize=(12, 6))
fruit_totals.plot.bar(edgecolor='black')
plt.title('Total Fruit Counts Across Training Set')
plt.xlabel('Fruit Type')
plt.ylabel('Total Count in Images')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
# Record each image's height (axis 0) and width (axis 1) as DataFrame columns.
for axis, column in enumerate(('height', 'width')):
    merged_train_df[column] = merged_train_df['image'].apply(lambda img, axis=axis: img.shape[axis])

# Side-by-side histograms: (column, bar colour, panel title).
histogram_panels = [
    ('height', 'skyblue', 'Distribution of Image Heights'),
    ('width', 'salmon', 'Distribution of Image Widths'),
]

plt.figure(figsize=(12, 5))
for position, (column, color, title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, position)
    merged_train_df[column].plot.hist(bins=30, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel('Pixels')

plt.tight_layout()
plt.show()


In [None]:
# Function to hash image arrays
def hash_img(img_array):
    """Return an MD5 hex digest that identifies an image array exactly.

    The digest covers the array's shape and dtype as well as its raw bytes,
    so two arrays only share a hash when they are element-for-element
    identical. Hashing the bytes alone would also match arrays whose bytes
    agree but whose dimensions differ (e.g. a 2x3 and a 3x2 image).

    Args:
        img_array: array-like image data (converted with ``np.asarray``).

    Returns:
        A 32-character hexadecimal string. MD5 is used purely as a fast
        fingerprint for duplicate detection, not for security.
    """
    arr = np.asarray(img_array)
    digest = hashlib.md5()
    # Prefix with shape and dtype so equal byte streams of different layouts differ.
    digest.update(f"{arr.shape}|{arr.dtype.str}|".encode("ascii"))
    digest.update(arr.tobytes())
    return digest.hexdigest()

# Add hash column
merged_train_df['img_hash'] = merged_train_df['image'].apply(hash_img)

# Number of rows sharing each hash; a count above 1 marks a group of identical images.
duplicate_counts = merged_train_df['img_hash'].value_counts()
repeated_hashes = duplicate_counts[duplicate_counts > 1]

# num_duplicates is the number of duplicate *groups* (distinct repeated hashes).
num_duplicates = int((duplicate_counts > 1).sum())

# Optional: show duplicate groups (every row whose hash occurs more than once)
duplicates = merged_train_df[merged_train_df['img_hash'].isin(repeated_hashes.index)]

# Report groups and affected rows separately so the numbers are not confused.
print(f"Found {num_duplicates} groups of exact duplicate images ({len(duplicates)} images in total).")


In [None]:
# Approximate brightness using grayscale average
def brightness(img_array):
    """Return the mean of all values in ``img_array`` (every pixel and channel weighted equally)."""
    pixels = np.asarray(img_array)
    return pixels.mean()

# Store the mean pixel value of every training image.
merged_train_df['brightness'] = merged_train_df['image'].apply(brightness)

# Histogram of the brightness values, styled through the returned axes.
plt.figure(figsize=(7, 5))
brightness_ax = merged_train_df['brightness'].plot.hist(bins=50, edgecolor='black')
brightness_ax.set_title('Image Brightness Distribution')
brightness_ax.set_xlabel('Average Brightness')
brightness_ax.grid(True, linestyle='--', alpha=0.6)
plt.show()


In [None]:
# Sort by brightness and show extremes

darkest_imgs = merged_train_df.nsmallest(5, 'brightness')
brightest_imgs = merged_train_df.nlargest(5, 'brightness')

def show_images(df_subset, title):
    """Display the images of ``df_subset`` side by side in a single row.

    Args:
        df_subset: DataFrame with an 'image' column of arrays accepted by
            ``plt.imshow``.
        title: text used as the figure's super-title.

    The row has at least five columns (the original layout) and grows when
    the subset holds more than five images, so larger subsets no longer
    fail with an out-of-range subplot index.
    """
    images = list(df_subset['image'])
    n_cols = max(5, len(images))

    # 3 inches per column keeps the original 15x3 figure for five images.
    plt.figure(figsize=(3 * n_cols, 3))
    for i, img_array in enumerate(images):
        plt.subplot(1, n_cols, i + 1)
        plt.imshow(img_array)
        plt.axis('off')
    plt.suptitle(title)
    plt.show()

show_images(darkest_imgs, "Darkest Images")
show_images(brightest_imgs, "Brightest Images")


In [None]:
# Find total number of fruits in each picture
cols_to_sum = [
    "Mango", "Grape", "Plum", "Kiwi", "Pear",
    "Apple", "Orange", "Banana", "Pomegranate", "Strawberry",
    "Pineapple", "Fig", "Peach", "Apricot", "Avocado",
    "Summer Squash", "Lemon", "Lime", "Guava", "Raspberry",
]

# Temporarily keep the per-image total as a column so it can be plotted.
merged_train_df['row_sum'] = merged_train_df.loc[:, cols_to_sum].sum(axis=1)

merged_train_df['row_sum'].hist()
plt.title('Distribution of Total Number of Fruits in Each Image')
plt.xlabel('Sum Fruits')
plt.ylabel('Frequency')
plt.show()

# The helper column was only needed for the plot; remove it again.
merged_train_df = merged_train_df.drop(columns='row_sum')

# YOLO

In [None]:
# Make YAML: dataset description passed to training and validation via `data=`.
yolo_class_names = dict(enumerate(['Apple', 'Banana', 'Orange', 'Grape']))

fruits_yaml = {
    'path': '/content/deepfruits/dataset',
    'train': 'images/train',
    # The test images double as the validation split.
    'val': 'images/test',
    # Class id -> class name.
    'names': yolo_class_names,
}

yaml_path = '/content/deepfruits/fruits.yaml'
with open(yaml_path, 'w') as yaml_file:
    yaml.dump(fruits_yaml, yaml_file)

print("✅ Capitalized fruits.yaml created!")

In [None]:
# Fruits that get YOLO labels; the list position is the class id.
target_classes = ['Apple', 'Banana', 'Orange', 'Grape']
class_to_id = dict(zip(target_classes, range(len(target_classes))))

In [None]:
# TRAIN FOLDER
# Paths to save labels and images
img_save_dir = '/content/deepfruits/dataset/images/train'
label_save_dir = '/content/deepfruits/dataset/labels/train'

# Create both output folders up front; folders that already exist are left as they are.
for output_dir in (img_save_dir, label_save_dir):
    os.makedirs(output_dir, exist_ok=True)

# Function to create random bounding boxes
def create_random_boxes(count, img_w, img_h, rng=None):
    """Generate ``count`` randomly placed boxes in normalized YOLO format.

    NOTE: the boxes are random placeholders; nothing here looks at the image
    content, so they do not mark where objects actually are.

    Args:
        count: number of boxes to generate (zero or less yields an empty list).
        img_w: image width in pixels.
        img_h: image height in pixels.
        rng: optional random source exposing ``uniform(low, high)``, e.g.
            ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``.
            Defaults to NumPy's global random state, as before; pass a seeded
            generator to make the generated labels reproducible.

    Returns:
        A list of ``(x_center, y_center, width, height)`` tuples, each value
        expressed as a fraction of the image size. Every box spans 10-30% of
        the image in each dimension and lies fully inside the image.
    """
    uniform = np.random.uniform if rng is None else rng.uniform

    boxes = []
    for _ in range(count):
        # Box size first, then a top-left corner that keeps the box inside the image.
        w = uniform(0.1, 0.3) * img_w
        h = uniform(0.1, 0.3) * img_h
        x = uniform(0, img_w - w)
        y = uniform(0, img_h - h)

        # Normalize
        x_center = (x + w / 2) / img_w
        y_center = (y + h / 2) / img_h
        w_norm = w / img_w
        h_norm = h / img_h
        boxes.append((x_center, y_center, w_norm, h_norm))
    return boxes

# Create YOLO label files: one image file plus one .txt label file per training row.
for idx, row in merged_train_df.iterrows():
    img_array = row['image']
    filename = row['filename']
    image_path = os.path.join(img_save_dir, filename)

    # Save image to disk; the array is RGB, so it is converted to BGR before writing.
    img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
    # cv2.imwrite reports failure through its return value, so check it and
    # skip the label file for an image that was never written.
    if not cv2.imwrite(image_path, img_bgr):
        print(f"Failed to write image: {image_path}")
        continue

    h, w, _ = img_array.shape
    yolo_lines = []

    for fruit in target_classes:
        # A missing or NaN count is treated as "no fruit of this class".
        raw_count = row.get(fruit, 0)
        count = 0 if pd.isna(raw_count) else int(raw_count)
        for x_center, y_center, box_w, box_h in create_random_boxes(count, w, h):
            # YOLO label line: class id followed by the four normalized box values.
            yolo_lines.append(
                f"{class_to_id[fruit]} {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}"
            )

    # Write YOLO label file (same stem as the image, .txt extension)
    label_filename = os.path.splitext(filename)[0] + '.txt'
    with open(os.path.join(label_save_dir, label_filename), 'w') as f:
        f.write('\n'.join(yolo_lines))

In [None]:
# TEST FOLDER
# Paths to save labels and images
img_save_dir = '/content/deepfruits/dataset/images/test'
label_save_dir = '/content/deepfruits/dataset/labels/test'

# Create both output folders up front; folders that already exist are left as they are.
for output_dir in (img_save_dir, label_save_dir):
    os.makedirs(output_dir, exist_ok=True)

# Function to create random bounding boxes
def create_random_boxes(count, img_w, img_h, rng=None):
    """Generate ``count`` randomly placed boxes in normalized YOLO format.

    NOTE: the boxes are random placeholders; nothing here looks at the image
    content, so they do not mark where objects actually are.

    Args:
        count: number of boxes to generate (zero or less yields an empty list).
        img_w: image width in pixels.
        img_h: image height in pixels.
        rng: optional random source exposing ``uniform(low, high)``, e.g.
            ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``.
            Defaults to NumPy's global random state, as before; pass a seeded
            generator to make the generated labels reproducible.

    Returns:
        A list of ``(x_center, y_center, width, height)`` tuples, each value
        expressed as a fraction of the image size. Every box spans 10-30% of
        the image in each dimension and lies fully inside the image.
    """
    uniform = np.random.uniform if rng is None else rng.uniform

    boxes = []
    for _ in range(count):
        # Box size first, then a top-left corner that keeps the box inside the image.
        w = uniform(0.1, 0.3) * img_w
        h = uniform(0.1, 0.3) * img_h
        x = uniform(0, img_w - w)
        y = uniform(0, img_h - h)

        # Normalize
        x_center = (x + w / 2) / img_w
        y_center = (y + h / 2) / img_h
        w_norm = w / img_w
        h_norm = h / img_h
        boxes.append((x_center, y_center, w_norm, h_norm))
    return boxes

# Create YOLO label files: one image file plus one .txt label file per test row.
for idx, row in merged_test_df.iterrows():
    img_array = row['image']
    filename = row['filename']
    image_path = os.path.join(img_save_dir, filename)

    # Save image to disk; the array is RGB, so it is converted to BGR before writing.
    img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
    # cv2.imwrite reports failure through its return value, so check it and
    # skip the label file for an image that was never written.
    if not cv2.imwrite(image_path, img_bgr):
        print(f"Failed to write image: {image_path}")
        continue

    h, w, _ = img_array.shape
    yolo_lines = []

    for fruit in target_classes:
        # A missing or NaN count is treated as "no fruit of this class".
        raw_count = row.get(fruit, 0)
        count = 0 if pd.isna(raw_count) else int(raw_count)
        for x_center, y_center, box_w, box_h in create_random_boxes(count, w, h):
            # YOLO label line: class id followed by the four normalized box values.
            yolo_lines.append(
                f"{class_to_id[fruit]} {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}"
            )

    # Write YOLO label file (same stem as the image, .txt extension)
    label_filename = os.path.splitext(filename)[0] + '.txt'
    with open(os.path.join(label_save_dir, label_filename), 'w') as f:
        f.write('\n'.join(yolo_lines))

### Check the generated labels

In [None]:
# Example: print the raw contents of one generated annotation file
label_path = '/content/deepfruits/dataset/labels/test/20190106_072818.txt'

with open(label_path, 'r') as label_file:
    label_text = label_file.read()

print(label_text)

In [None]:
# Define class names (indexed by the class id stored in the label files)
class_names = ['apple', 'banana', 'orange', 'grape']

# Example: visualize one image and its labels
image_dir = '/content/deepfruits/dataset/images/train'
label_dir = '/content/deepfruits/dataset/labels/train'

example_filename = 'IMG_123.jpg'  # Replace with a real filename

image_path = os.path.join(image_dir, example_filename)
# Label files share the image's stem with a '.txt' extension; splitext handles
# any image extension, not just a lowercase '.jpg'.
label_path = os.path.join(label_dir, os.path.splitext(example_filename)[0] + '.txt')

# Load image. cv2.imread returns None instead of raising when the file cannot
# be read, so fail early with a clear message.
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
h, w, _ = img.shape

# Plot bounding boxes
if os.path.exists(label_path):
    with open(label_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines in the label file
            cls_id, xc, yc, bw, bh = map(float, fields)
            # Convert normalized centre/size to pixel corner coordinates
            x1 = int((xc - bw / 2) * w)
            y1 = int((yc - bh / 2) * h)
            x2 = int((xc + bw / 2) * w)
            y2 = int((yc + bh / 2) * h)

            color = (255, 0, 0)
            label = class_names[int(cls_id)]
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            cv2.putText(img, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

# Show image with boxes
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.axis('off')
plt.title(example_filename)
plt.show()


### Train the model (choose the number of epochs and the fraction of training images to use)

In [None]:
# Build the model from the 'yolov8n.pt' weights file.
model = YOLO('yolov8n.pt')

# Training settings collected in one place so they are easy to tweak.
train_settings = dict(
    data='/content/deepfruits/fruits.yaml',
    epochs=100,
    imgsz=416,
    batch=6,
    fraction=0.35,
)
model.train(**train_settings)

### Test the model

In [None]:
# Run the trained model on every image in the exported test folder.
# classes=[0, 1, 2, 3] lists all four class ids defined for this dataset.
# NOTE(review): save=True presumably writes annotated outputs to disk and conf=0.25
# is presumably the minimum detection confidence — confirm against the Ultralytics docs.
model.predict(
    source='/content/deepfruits/dataset/images/test',
    classes=[0, 1, 2, 3],
    save=True,
    conf=0.25
)

In [None]:
# Validate the model against the dataset described by fruits.yaml and keep the metrics object.
# NOTE(review): fruits.yaml sets 'val' to images/test, so this evaluates on the test images.
metrics = model.val(data='/content/deepfruits/fruits.yaml')

### Evaluation metrics: mAP

In [None]:
# Report the box metrics from validation, rounded to three decimals:
# map50 is printed as mAP@0.5 and map as mAP@0.5:0.95.
print(f"📊 mAP@0.5: {metrics.box.map50:.3f}")
print(f"📈 mAP@0.5:0.95: {metrics.box.map:.3f}")



In [None]:
def show_yolo_labels(image_path, label_path, class_names):
    """Draw the boxes of a YOLO label file on its image and display the result.

    Args:
        image_path: path of the image file to load.
        label_path: path of the label file; each non-blank line holds
            ``class x_center y_center width height`` with the four box values
            given as fractions of the image size.
        class_names: sequence indexed by class id, used for the box captions.

    Raises:
        FileNotFoundError: if the image cannot be read. ``cv2.imread`` returns
            ``None`` rather than raising, which would otherwise surface later
            as an unrelated attribute error.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    h, w = img.shape[:2]

    with open(label_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines in the label file
            cls, x, y, bw, bh = map(float, fields)
            # Convert normalized centre/size to pixel corner coordinates.
            x1 = int((x - bw/2) * w)
            y1 = int((y - bh/2) * h)
            x2 = int((x + bw/2) * w)
            y2 = int((y + bh/2) * h)
            cv2.rectangle(img, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(img, class_names[int(cls)], (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

    # The image was loaded in BGR order; convert to RGB before handing it to matplotlib.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.show()
