# 📊 Exploratory Data Analysis (EDA) and Data Cleaning

This notebook performs comprehensive EDA on the processed PlantVillage dataset and handles:
- Data exploration and visualization
- Null value handling
- Data normalization
- Class balancing
- Feature analysis

In [1]:
# Import required libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Image processing
import cv2
from PIL import Image
import imageio
from skimage import io, transform, filters, exposure

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix

# Utilities
from collections import Counter, defaultdict
from tqdm import tqdm
import json

# Plot appearance shared by every figure in this notebook:
# the 'seaborn-v0_8' matplotlib style sheet, a 12x8 default figure size,
# and seaborn's "husl" colour palette.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

# Confirmation banner so the cell output shows the setup completed.
print("📚 Libraries imported successfully!")

📚 Libraries imported successfully!


In [2]:
# Define paths
# The project root defaults to the original checkout location. Set the
# GREENCAST_BASE_PATH environment variable to point the notebook at a
# different project root without editing this cell.
DEFAULT_BASE_PATH = "/Users/debabratapattnayak/web-dev/greencast"
BASE_PATH = Path(os.environ.get("GREENCAST_BASE_PATH") or DEFAULT_BASE_PATH).expanduser()
PROCESSED_DATA_PATH = BASE_PATH / "processed_data"
FEATURES_PATH = BASE_PATH / "features"
MODELS_PATH = BASE_PATH / "models"

# Dataset paths
SYMLINK_DATASET_PATH = PROCESSED_DATA_PATH / "plantvillage_color_symlinks"
COPY_DATASET_PATH = PROCESSED_DATA_PATH / "plantvillage_color"

# Choose the dataset to use (prefer symlinks to save space).
# This check runs before any directory is created so that a wrong project
# root fails with a clear message instead of creating stray output folders
# (or raising an unrelated permission error from mkdir).
if SYMLINK_DATASET_PATH.exists():
    DATASET_PATH = SYMLINK_DATASET_PATH
    print(f"📁 Using symlinked dataset: {DATASET_PATH}")
elif COPY_DATASET_PATH.exists():
    DATASET_PATH = COPY_DATASET_PATH
    print(f"📁 Using copied dataset: {DATASET_PATH}")
else:
    raise FileNotFoundError(
        "No processed dataset found! "
        f"Looked for {SYMLINK_DATASET_PATH} and {COPY_DATASET_PATH}. "
        "Set GREENCAST_BASE_PATH to the project root if it lives elsewhere."
    )

# Create output directories (no-op when they already exist)
FEATURES_PATH.mkdir(parents=True, exist_ok=True)
MODELS_PATH.mkdir(parents=True, exist_ok=True)

print("✅ Paths configured successfully")

📁 Using symlinked dataset: /Users/debabratapattnayak/web-dev/greencast/processed_data/plantvillage_color_symlinks
✅ Paths configured successfully
