# Downloading the Data (DON'T EXECUTE THIS CODE UNLESS YOU REALLY NEED THE DATA!)

In [None]:
import kagglehub

# Fetch the latest published version of the PlantVillage dataset; the call's
# return value is kept in `path` and printed below.
dataset_handle = "abdallahalidev/plantvillage-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import random

### Import the data into a data directory on the same level as the notebooks

In [None]:
# Locate the "color" image folder inside the kagglehub download cache without
# hard-coding one particular user's home directory.
# kagglehub is documented to cache under ~/.cache/kagglehub unless the
# KAGGLEHUB_CACHE environment variable points somewhere else.
# NOTE(review): the dataset version ("3") is still pinned here - adjust it if a
# different version was downloaded.
kagglehub_cache = Path(os.environ.get("KAGGLEHUB_CACHE", Path.home() / ".cache" / "kagglehub"))
data_dir = (
    kagglehub_cache
    / "datasets" / "abdallahalidev" / "plantvillage-dataset"
    / "versions" / "3" / "plantvillage dataset" / "color"
)

In [None]:
# Count the images in every class folder (each sub-directory of data_dir is
# treated as one class, named after the folder).
# The extension is compared case-insensitively in a single pass: globbing
# "*.jpg" and "*.JPG" separately counts every file twice wherever pathlib's
# pattern matching ignores case (e.g. on Windows).
counts = {
    cls.name: sum(1 for f in cls.iterdir() if f.suffix.lower() == ".jpg")
    for cls in data_dir.iterdir()
    if cls.is_dir()
}
# One row per class, largest classes first.
df = pd.DataFrame(list(counts.items()), columns=["class", "n_images"]).sort_values("n_images", ascending=False)
df.head()

# Plot classes and count

In [None]:
# Horizontal bar chart: one bar per class, bar length = number of images.
# NOTE(review): recent seaborn releases (>= 0.13) reportedly warn when `palette`
# is passed without `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x="n_images", y="class", palette="viridis", ax=ax)
ax.set_title("Number of images per class")
ax.set_xlabel("Images")
ax.set_ylabel("Class")
plt.show()

# Different Plants and diseases

In [None]:
# Split every class name on the "___" separator once and derive both columns
# from the resulting pieces.
split_names = [name.split("___") for name in df["class"]]
# The piece before the first separator is the plant.
df["plant"] = [pieces[0] for pieces in split_names]
# The second piece is the condition; names without a separator get "unknown".
df["condition"] = [pieces[1] if len(pieces) > 1 else "unknown" for pieces in split_names]

df.head()

In [None]:
# Total number of images per plant, largest totals first.
images_by_plant = df.groupby("plant")["n_images"].sum()
plant_counts = images_by_plant.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 6))
plant_counts.plot.bar(ax=ax, color="skyblue")
ax.set_title("Number of images per plant")
ax.set_ylabel("Images")
plt.show()

In [None]:
# Sum the image counts for every (plant, condition) pair; as_index=False keeps
# the two grouping keys as ordinary columns instead of an index.
plant_condition_counts = df.groupby(["plant", "condition"], as_index=False)["n_images"].sum()

plant_condition_counts.head(10)

In [None]:
# Keep only the rows whose plant is "Tomato".
is_tomato = plant_condition_counts["plant"] == "Tomato"
tomato = plant_condition_counts.loc[is_tomato]

# Horizontal bars: one per tomato condition, length = number of images.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(tomato["condition"], tomato["n_images"], color="tomato")
ax.set_title("Tomato diseases (image count)")
ax.set_xlabel("Number of images")
# Flip the y-axis so the first row is drawn at the top.
ax.invert_yaxis()
plt.show()

# Healthy vs Sick 

In [None]:
# Label a class "healthy" when its name contains that word, otherwise "sick".
df['status'] = df['class'].apply(lambda x: 'healthy' if 'healthy' in x else 'sick')
# Total number of images per status.
status_counts = df.groupby("status")["n_images"].sum()

# Pick each bar's colour from its label rather than from its position, so a
# missing group cannot leave the remaining bar with the other group's colour.
status_colors = {"healthy": "green", "sick": "red"}
status_counts.plot(kind="bar", color=[status_colors[status] for status in status_counts.index])
plt.title("Healthy vs Diseased Images")
plt.ylabel("Number of images")
plt.show()

# Classes per plant

In [None]:
# Number of distinct classes per plant, largest first.
classes_by_plant = df.groupby("plant")["class"].nunique()
class_counts = classes_by_plant.sort_values(ascending=False)

# Plants with exactly one class are highlighted in red.
colors = ["red" if is_single else "skyblue" for is_single in (class_counts == 1)]

fig, ax = plt.subplots(figsize=(8, 6))
class_counts.plot.bar(ax=ax, color=colors)
ax.set_title("Number of classes per plant")
ax.set_ylabel("Unique classes")
ax.set_xlabel("Plant")
# Slant the plant names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()

# Correlation between class count and image count

In [None]:
# Per-plant summary: number of distinct classes and total number of images.
per_plant = df.groupby("plant", as_index=False).agg(
    classes=("class", "nunique"),
    images=("n_images", "sum"),
)

# One point per plant: x = class count, y = image count.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(per_plant["classes"], per_plant["images"], s=80, color="seagreen", alpha=0.7)

# Write each plant's name next to its point, nudged slightly to the right.
for plant_name, n_classes, n_imgs in zip(per_plant["plant"], per_plant["classes"], per_plant["images"]):
    ax.text(n_classes + 0.05, n_imgs, plant_name, fontsize=9)

ax.set_xlabel("Number of classes (diseases + healthy)")
ax.set_ylabel("Number of images")
ax.set_title("Classes vs. Images per Plant")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

# Data density per class

In [None]:
# Rebuild the per-plant summary (distinct classes and total images) so this
# cell can run on its own.
per_plant = df.groupby("plant", as_index=False).agg(
    classes=("class", "nunique"),
    images=("n_images", "sum"),
)

# Density = number of images divided by number of classes for each plant.
per_plant["density"] = per_plant["images"].div(per_plant["classes"])

# Highest density first.
per_plant_sorted = per_plant.sort_values(by="density", ascending=False)
per_plant_sorted.head(10)

In [None]:
# Horizontal bar chart of the density value for every plant, in the order of
# per_plant_sorted.
# NOTE(review): recent seaborn releases (>= 0.13) reportedly warn when `palette`
# is passed without `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=per_plant_sorted, x="density", y="plant", palette="viridis", ax=ax)
ax.set_title("Data Density: Images per Class per Plant")
ax.set_xlabel("Images per class (density)")
ax.set_ylabel("Plant")
plt.show()

# Image size stats

In [None]:
# Collect the size of every image so the distribution of dimensions can be
# summarised with describe().
# The extension is matched case-insensitively: globbing only "*.jpg" left out
# the upper-case ".JPG" files that the class-count cell explicitly includes.
sizes = []
for cls in data_dir.iterdir():
    if not cls.is_dir():
        continue  # ignore stray files lying next to the class folders
    for img_path in cls.iterdir():
        if img_path.suffix.lower() != ".jpg":
            continue
        # The context manager closes the image file again after reading its size.
        with Image.open(img_path) as img:
            sizes.append(img.size)

# img.size is stored as a (width, height) pair, matching the column names.
sizes_df = pd.DataFrame(sizes, columns=["width","height"])
print(sizes_df.describe())