# FreshHarvest Data Analysis

This notebook provides comprehensive data analysis for the FreshHarvest fruit freshness classification dataset.

## Analysis Overview
- Dataset structure and statistics
- Class distribution analysis
- Image quality assessment
- Data preprocessing insights
- Visualization of sample images
- Data quality recommendations

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

from cvProject_FreshHarvest.utils.common import read_yaml, setup_logging

# Configure logging and the default plot appearance used by the cells below.
# The style is applied before the palette, matching the original statement order.
setup_logging()
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook banner: title line followed by a 40-character rule
print("FreshHarvest Data Analysis Notebook", "=" * 40, sep="\n")

## 1. Dataset Overview

In [None]:
# Read the project configuration and echo the settings printed below
config = read_yaml('../config/config.yaml')
print("Configuration loaded:")
print(f"- Image size: {config['data']['image_size']}")
print(f"- Number of classes: {config['data']['num_classes']}")
print(f"- Batch size: {config['training']['batch_size']}")

# Dataset locations: the raw and processed roots first, then one entry per
# split directory located under the processed root
data_paths = {'raw': '../data/raw', 'processed': '../data/processed'}
data_paths.update({
    split: f"{data_paths['processed']}/{split}"
    for split in ('train', 'val', 'test')
})

# Class names: every fruit appears once with the 'F_' prefix and once with
# the 'S_' prefix, all 'F_' classes first
CLASS_NAMES = [
    f'{prefix}_{fruit}'
    for prefix in ('F', 'S')
    for fruit in (
        'Banana', 'Lemon', 'Lulo', 'Mango',
        'Orange', 'Strawberry', 'Tamarillo', 'Tomato',
    )
]

print(f"\nClass names: {CLASS_NAMES}")

## 2. Dataset Structure Analysis

In [None]:
def analyze_dataset_structure(data_path):
    """Count the images in each class directory under ``data_path``.

    Every immediate subdirectory of ``data_path`` is treated as one class.
    An entry is counted as an image when its name ends in ``.png``, ``.jpg``
    or ``.jpeg`` (case-insensitive). A per-class table and a grand total are
    printed as a side effect.

    Args:
        data_path: Directory that contains one subdirectory per class.

    Returns:
        A dict mapping class directory name to image count, with keys in
        alphabetical order, or ``None`` if ``data_path`` does not exist.
    """
    if not os.path.exists(data_path):
        print(f"⚠️ Path does not exist: {data_path}")
        return None

    image_extensions = ('.png', '.jpg', '.jpeg')
    structure = {}

    print(f"\n📁 Analyzing: {data_path}")
    print("-" * 50)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # printed report and the returned dict stable across runs and filesystems.
    for class_name in sorted(os.listdir(data_path)):
        class_path = os.path.join(data_path, class_name)
        if not os.path.isdir(class_path):
            continue

        count = sum(
            1
            for file_name in os.listdir(class_path)
            if file_name.lower().endswith(image_extensions)
        )
        structure[class_name] = count
        print(f"{class_name:15}: {count:5d} images")

    total_images = sum(structure.values())
    print("-" * 50)
    print(f"{'Total':15}: {total_images:5d} images")

    return structure

# Run the structure analysis for the train/val/test entries of data_paths;
# every other entry is skipped.
dataset_structures = {}
for split_name, split_path in data_paths.items():
    if split_name not in ('train', 'val', 'test'):
        continue
    dataset_structures[split_name] = analyze_dataset_structure(split_path)

## 3. Class Distribution Visualization

In [None]:
# Class distribution figure: four panels built from the per-split counts
def plot_class_distribution(dataset_structures):
    """Draw a 2x2 overview of how images are distributed across splits.

    ``dataset_structures`` maps a split name to a ``{class_name: count}``
    dict; splits whose value is falsy (``None`` or empty) are ignored.
    The panels show counts per class and split, per condition
    (``'Fresh'`` for names starting with ``'F_'``, otherwise ``'Spoiled'``),
    per fruit type, and the share of each split in the whole dataset.

    Returns the long-format DataFrame used for plotting, or ``None`` (after
    printing a warning) when there is nothing to plot.
    """

    def _label_axes(ax, title, xlabel, ylabel, rotation):
        # Shared decoration for the three bar-chart panels
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=rotation)
        ax.legend()

    # One record per (split, class) pair
    records = []
    for split_name, structure in dataset_structures.items():
        if not structure:
            continue
        records.extend(
            {
                'Split': split_name,
                'Class': class_name,
                'Count': count,
                'Fruit': class_name.split('_')[1] if '_' in class_name else class_name,
                'Condition': 'Fresh' if class_name.startswith('F_') else 'Spoiled',
            }
            for class_name, count in structure.items()
        )

    if not records:
        print("⚠️ No data available for plotting")
        return None

    df = pd.DataFrame(records)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_classes, ax_condition), (ax_fruit, ax_split) = axes

    # Top-left: every class, one bar per split
    per_class = df.pivot_table(index='Class', columns='Split', values='Count', fill_value=0)
    per_class.plot(kind='bar', ax=ax_classes, width=0.8)
    _label_axes(ax_classes, 'Class Distribution by Dataset Split',
                'Class', 'Number of Images', 45)

    # Top-right: fresh vs spoiled totals within each split
    per_condition = (
        df.groupby(['Split', 'Condition'])['Count'].sum().reset_index()
        .pivot(index='Split', columns='Condition', values='Count')
    )
    per_condition.plot(kind='bar', ax=ax_condition, width=0.6)
    _label_axes(ax_condition, 'Fresh vs Spoiled Distribution',
                'Dataset Split', 'Number of Images', 0)

    # Bottom-left: totals per fruit type, one bar per split
    per_fruit = (
        df.groupby(['Split', 'Fruit'])['Count'].sum().reset_index()
        .pivot(index='Fruit', columns='Split', values='Count')
    )
    per_fruit.plot(kind='bar', ax=ax_fruit, width=0.8)
    _label_axes(ax_fruit, 'Distribution by Fruit Type',
                'Fruit Type', 'Number of Images', 45)

    # Bottom-right: share of each split in the whole dataset
    split_totals = df.groupby('Split')['Count'].sum()
    ax_split.pie(split_totals.values, labels=split_totals.index, autopct='%1.1f%%')
    ax_split.set_title('Dataset Split Composition')

    plt.tight_layout()
    plt.show()

    return df

# Plot class distribution for the analyzed splits.
# df_analysis receives the function's return value: the DataFrame built for
# plotting, or None when no split contained any data.
df_analysis = plot_class_distribution(dataset_structures)

## 4. Image Quality Analysis

In [None]:
def analyze_image_quality(data_path, sample_size=100):
    """Collect basic quality metrics from a random sample of images.

    Every immediate subdirectory of ``data_path`` is treated as one class.
    The sampling budget is divided evenly over the class directories that
    are actually present (at least one image per class while budget
    remains), and files are drawn without replacement with
    ``np.random.choice``. Only files ending in ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive) are considered.

    Args:
        data_path: Directory that contains one subdirectory per class.
        sample_size: Maximum number of images to analyze in total.

    Returns:
        A dict of parallel lists with one entry per analyzed image:
        ``'widths'``, ``'heights'``, ``'channels'`` (from the decoded array
        shape), ``'file_sizes'`` (``os.path.getsize`` of the file),
        ``'brightness'`` (mean of the grayscale image) and ``'contrast'``
        (standard deviation of the grayscale image). Returns ``None`` if
        ``data_path`` does not exist.
    """
    if not os.path.exists(data_path):
        print(f"⚠️ Path does not exist: {data_path}")
        return None

    quality_metrics = {
        'widths': [],
        'heights': [],
        'channels': [],
        'file_sizes': [],
        'brightness': [],
        'contrast': []
    }
    image_extensions = ('.png', '.jpg', '.jpeg')

    print(f"\n🔍 Analyzing image quality from: {data_path}")
    print(f"Sample size: {sample_size} images")

    # Sorted so classes are visited in a stable order (os.listdir order is arbitrary).
    class_paths = [
        os.path.join(data_path, class_name)
        for class_name in sorted(os.listdir(data_path))
        if os.path.isdir(os.path.join(data_path, class_name))
    ]

    # Divide the budget by the number of classes found instead of assuming 16,
    # and never round a positive budget down to zero images per class.
    if class_paths and sample_size > 0:
        per_class_quota = max(1, sample_size // len(class_paths))
    else:
        per_class_quota = 0

    sample_count = 0
    unreadable_count = 0

    for class_path in class_paths:
        if sample_count >= sample_size:
            break  # budget exhausted; no need to list the remaining classes

        image_files = [f for f in os.listdir(class_path)
                       if f.lower().endswith(image_extensions)]

        class_sample_size = min(per_class_quota, len(image_files),
                                sample_size - sample_count)
        if class_sample_size <= 0:
            continue
        sampled_files = np.random.choice(image_files, class_sample_size, replace=False)

        for img_file in sampled_files:
            img_path = os.path.join(class_path, img_file)

            try:
                # cv2.imread signals an undecodable file by returning None
                img = cv2.imread(img_path)
                if img is None:
                    unreadable_count += 1
                    continue

                height, width, channels = img.shape

                # Basic metrics
                quality_metrics['widths'].append(width)
                quality_metrics['heights'].append(height)
                quality_metrics['channels'].append(channels)
                quality_metrics['file_sizes'].append(os.path.getsize(img_path))

                # Convert to grayscale for brightness/contrast analysis
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                quality_metrics['brightness'].append(np.mean(gray))
                quality_metrics['contrast'].append(np.std(gray))

                sample_count += 1

            except Exception as e:
                # Best effort: report the failing file and keep sampling
                print(f"Error processing {img_path}: {e}")
                continue

    if unreadable_count:
        print(f"⚠️ Skipped {unreadable_count} unreadable images")
    print(f"✅ Analyzed {sample_count} images")
    return quality_metrics

# Analyze image quality for training set, with a sampling budget of 200 images.
# train_quality is a dict of per-image metric lists, or None if the train
# path does not exist.
train_quality = analyze_image_quality(data_paths['train'], sample_size=200)