In [1]:
import os
from pathlib import Path
import shutil
import random
import json
from PIL import Image

# Input: raw images, grouped into one sub-directory per class.
# Output: root folder that receives the processed dataset.
raw_root = Path("../data_raw/")
processed_root = Path("../data_processed")
processed_root.mkdir(exist_ok=True)

# Each sub-directory name under the raw root is treated as a class label;
# sorting keeps the class order stable between runs.
classes = sorted(entry.name for entry in raw_root.iterdir() if entry.is_dir())
len(classes)

47

In [2]:
import re

def clean_name(name: str) -> str:
    """Normalise a raw class folder name into a filesystem-safe label.

    The result contains only the characters ``a-z``, ``0-9`` and ``_``:
    the name is lower-cased, spaces become underscores, every other
    character (parentheses, dots, slashes, hyphens, ...) is dropped, and
    runs of underscores are collapsed to a single one.

    Args:
        name: Original class name, e.g. ``"Begonia (Begonia spp.)"``.

    Returns:
        The cleaned label, e.g. ``"begonia_begonia_spp"``.
    """
    clean = name.lower()
    clean = clean.replace(" ", "_")
    # Drop everything outside the allowed alphabet in one pass.
    clean = re.sub(r"[^a-z0-9_]", "", clean)
    # Collapse underscore runs only AFTER stripping punctuation: removing a
    # character such as "-" in "foo - bar" is what creates "__" in the first
    # place, and a single str.replace("__", "_") cannot shrink runs of 3+.
    clean = re.sub(r"_+", "_", clean)
    return clean

# Cleaned label for every raw class folder, in the same order as `classes`.
clean_names = [clean_name(c) for c in classes]

# Different folder names can normalise to the same label (e.g. they differ
# only in punctuation). That would silently merge two classes into a single
# label, so fail loudly instead.
duplicate_names = sorted({c for c in clean_names if clean_names.count(c) > 1})
assert not duplicate_names, f"clean_name() produced colliding labels: {duplicate_names}"

# Original folder name -> cleaned label.
mapping = dict(zip(classes, clean_names))

mapping


{'African Violet (Saintpaulia ionantha)': 'african_violet_saintpaulia_ionantha',
 'Aloe Vera': 'aloe_vera',
 'Anthurium (Anthurium andraeanum)': 'anthurium_anthurium_andraeanum',
 'Areca Palm (Dypsis lutescens)': 'areca_palm_dypsis_lutescens',
 'Asparagus Fern (Asparagus setaceus)': 'asparagus_fern_asparagus_setaceus',
 'Begonia (Begonia spp.)': 'begonia_begonia_spp',
 'Bird of Paradise (Strelitzia reginae)': 'bird_of_paradise_strelitzia_reginae',
 'Birds Nest Fern (Asplenium nidus)': 'birds_nest_fern_asplenium_nidus',
 'Boston Fern (Nephrolepis exaltata)': 'boston_fern_nephrolepis_exaltata',
 'Calathea': 'calathea',
 'Cast Iron Plant (Aspidistra elatior)': 'cast_iron_plant_aspidistra_elatior',
 'Chinese Money Plant (Pilea peperomioides)': 'chinese_money_plant_pilea_peperomioides',
 'Chinese evergreen (Aglaonema)': 'chinese_evergreen_aglaonema',
 'Christmas Cactus (Schlumbergera bridgesii)': 'christmas_cactus_schlumbergera_bridgesii',
 'Chrysanthemum': 'chrysanthemum',
 'Ctenanthe': 'c

In [3]:
# Folder that receives the label metadata files.
models_dir = Path("../models")
# open(..., "w") does not create missing parent directories, so make sure the
# folder exists before writing into it.
models_dir.mkdir(parents=True, exist_ok=True)

# JSON file: original folder name -> cleaned label.
with open(models_dir / "label_mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=4)

# Plain-text file: one cleaned label per line, in `clean_names` order.
with open(models_dir / "labels.txt", "w", encoding="utf-8") as f:
    for cname in clean_names:
        f.write(cname + "\n")


In [4]:
# Create the <processed_root>/<class>/<split> folder for every class and split.
# Existing folders are left untouched (exist_ok=True).
for cname in clean_names:
    for split in ("train", "val", "test"):
        (processed_root / cname / split).mkdir(parents=True, exist_ok=True)


In [5]:
# Split every class into train / val / test and copy the files into
# <processed_root>/<class>/<split>/.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.15  # whatever is left after train + val becomes the test split

# A dedicated, seeded generator makes the split reproducible. Re-running this
# cell puts every image into the same split again instead of scattering copies
# of one image across several splits (train/test leakage).
split_rng = random.Random(SPLIT_SEED)

for orig, cname in mapping.items():
    orig_dir = raw_root / orig
    # Regular files only: shutil.copy() fails on sub-directories.
    # Sorted first, because glob order depends on the filesystem and would
    # otherwise make the seeded shuffle non-deterministic.
    images = sorted(p for p in orig_dir.glob("*") if p.is_file())
    split_rng.shuffle(images)

    n = len(images)
    n_train = int(n * TRAIN_FRACTION)
    n_val = int(n * VAL_FRACTION)

    train_imgs = images[:n_train]
    val_imgs = images[n_train:n_train + n_val]
    test_imgs = images[n_train + n_val:]

    # Copy each split into its own folder, keeping the original file names.
    for split_name, split_imgs in (
        ("train", train_imgs),
        ("val", val_imgs),
        ("test", test_imgs),
    ):
        for img_path in split_imgs:
            shutil.copy(img_path, processed_root / cname / split_name / img_path.name)
