In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Dataset root: the folder that contains the RGB/, MS/ and HS/ sub-folders.
# Set the AGRI_DATASET_ROOT environment variable to run on another machine;
# when it is unset, the original author's local path is used unchanged.
ROOT = os.environ.get(
    "AGRI_DATASET_ROOT",
    r"D:\GitHub\AI-for-Agriculture-2026\dataset\train",
)
RGB_DIR = os.path.join(ROOT, "RGB")
MS_DIR  = os.path.join(ROOT, "MS")
HS_DIR  = os.path.join(ROOT, "HS")

SEED = 42         # random_state of the stratified train/val split
VAL_RATIO = 0.2   # fraction of samples assigned to the validation split
CLASSES = ["Health", "Other", "Rust"]  # label prefixes accepted in file stems

def label_from_stem(stem: str):
    """Return the class label encoded in a file stem, or None if it is unknown.

    The label is the text before the first underscore,
    e.g. ``"Health_Hyper_1"`` -> ``"Health"``. A stem without an underscore is
    treated as the label itself. Labels not listed in ``CLASSES`` yield None.
    """
    prefix, _, _ = stem.partition("_")
    if prefix in CLASSES:
        return prefix
    return None

def stem_no_ext(fname: str):
    """Return ``fname`` with its final extension (as split by ``os.path.splitext``) removed."""
    base, _extension = os.path.splitext(fname)
    return base

def list_stems(folder, exts):
    """Map each file stem in ``folder`` to the full path of the matching file.

    Args:
        folder: Directory to scan (not recursive).
        exts: Iterable of file extensions to accept, e.g. ``[".tif", ".tiff"]``.
            Matching is case-insensitive on both the extension and the filename.

    Returns:
        dict mapping stem (filename without its final extension) to the joined
        path ``os.path.join(folder, filename)``.

    Raises:
        Whatever ``os.listdir`` raises when ``folder`` cannot be listed
        (e.g. ``FileNotFoundError``).
    """
    # str.endswith accepts a tuple; lowercase so ".PNG" in exts still matches.
    suffixes = tuple(e.lower() for e in exts)
    out = {}
    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # reproducible, including which file wins when two share the same stem.
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(suffixes):
            continue
        path = os.path.join(folder, fname)
        # A directory named like "x.tif" is not a sample; keep regular files only.
        if not os.path.isfile(path):
            continue
        out[os.path.splitext(fname)[0]] = path
    return out

rgb = list_stems(RGB_DIR, exts=[".png", ".jpg", ".jpeg"])
ms  = list_stems(MS_DIR,  exts=[".tif", ".tiff"])
hs  = list_stems(HS_DIR,  exts=[".tif", ".tiff"])

# Keep only the stems that exist in all three modalities.
common = sorted(set(rgb) & set(ms) & set(hs))

rows = []
for stem in common:
    label = label_from_stem(stem)
    if label is None:
        # Stem prefix is not one of CLASSES; the sample is dropped.
        continue
    rows.append({
        "stem": stem,
        "label": label,
        "rgb_path": rgb[stem],
        "ms_path": ms[stem],
        "hs_path": hs[stem],
    })

# An empty sample list usually means a wrong folder or wrong file extensions.
# Fail here with an actionable message: an empty DataFrame has no "label"
# column, so the code below would otherwise die with an opaque KeyError.
if not rows:
    raise RuntimeError(
        "No aligned samples found: check ROOT and the file extensions in RGB/MS/HS "
        f"(rgb={len(rgb)}, ms={len(ms)}, hs={len(hs)}, common stems={len(common)})."
    )

df = pd.DataFrame(rows).reset_index(drop=True)

print("Total aligned samples:", len(df))
print(df["label"].value_counts())

# Stratified split by label: one shuffle, VAL_RATIO of the rows go to validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=VAL_RATIO, random_state=SEED)
idx = np.arange(len(df))
train_idx, val_idx = next(sss.split(idx, df["label"].values))

# The saved indices are row positions into samples_master.csv, so the three
# files must always be written (and later read) together.
os.makedirs("splits", exist_ok=True)
df.to_csv("splits/samples_master.csv", index=False, encoding="utf-8-sig")
np.save("splits/train_idx.npy", train_idx)
np.save("splits/val_idx.npy", val_idx)

print("Saved:")
print(" - splits/samples_master.csv")
print(" - splits/train_idx.npy")
print(" - splits/val_idx.npy")
print("Train:", len(train_idx), "Val:", len(val_idx))


Total aligned samples: 577
label
Rust      200
Health    191
Other     186
Name: count, dtype: int64
Saved:
 - splits/samples_master.csv
 - splits/train_idx.npy
 - splits/val_idx.npy
Train: 461 Val: 116


In [3]:
import numpy as np
import pandas as pd

# Reload the sample table and the saved split indices from disk.
df = pd.read_csv("splits/samples_master.csv")
train_idx = np.load("splits/train_idx.npy")
val_idx = np.load("splits/val_idx.npy")

# Select rows by position; each subset gets a fresh 0..n-1 index.
df_train, df_val = (
    df.iloc[positions].reset_index(drop=True)
    for positions in (train_idx, val_idx)
)

# Per-modality path lists, in the same row order as df_train / df_val.
rgb_train_files, ms_train_files, hs_train_files = (
    df_train[column].tolist() for column in ("rgb_path", "ms_path", "hs_path")
)
rgb_val_files, ms_val_files, hs_val_files = (
    df_val[column].tolist() for column in ("rgb_path", "ms_path", "hs_path")
)


In [6]:
# Print "<train size> <val size>" for each modality, in the order RGB, MS, HS.
for _train_files, _val_files in (
    (rgb_train_files, rgb_val_files),
    (ms_train_files, ms_val_files),
    (hs_train_files, hs_val_files),
):
    print(len(_train_files), len(_val_files))

# Fairness check: every modality must contribute the same number of samples
# to each split (a single distinct length across the three lists).
assert len({len(rgb_train_files), len(ms_train_files), len(hs_train_files)}) == 1
assert len({len(rgb_val_files), len(ms_val_files), len(hs_val_files)}) == 1

461 116
461 116
461 116


In [4]:
import random, os
import numpy as np
import torch

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode
    with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every library-level generator, in a fixed order.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed the Python, NumPy and PyTorch RNGs with the fixed value 42.
seed_everything(42)


In [5]:
# Class distribution of each split: training first, then validation.
for _split in (df_train, df_val):
    print(_split["label"].value_counts())


label
Rust      160
Health    152
Other     149
Name: count, dtype: int64
label
Rust      40
Health    39
Other     37
Name: count, dtype: int64
