# Calculate the mean and std for training dataset
# 计算训练集图像均值&方差

$$
\text{Average Mean} = \frac{{\mu_1 + \mu_2 + \ldots + \mu_N}}{N}
$$

$$
\text{Average Variance} = \frac{{\sigma_1^2 + \sigma_2^2 + \ldots + \sigma_N^2}}{N}
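$$

$$
\text{Overall Standard Deviation} = \sqrt{\text{Average Variance}}
$$

where $\mu_i$ and $\sigma_i^2$ are the per-channel mean and variance of image $i$, and $N$ is the number of training images. Note that this averages within-image variances only; it equals the pooled pixel standard deviation only when all images have the same size and the same mean.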


In [1]:
from PIL import Image
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from concurrent.futures import as_completed


# Dataset and project root directories are read from environment variables;
# each is None when its variable is not set.
ROOT_DATA = os.getenv('LARD_DATA_ROOT_PATH')
ROOT_PROJECT = os.getenv('LARD_PROJECT_ROOT_PATH')

# Echo both roots, one per line, so a missing variable is visible immediately.
print(ROOT_DATA, ROOT_PROJECT, sep='\n')


def compute_mean_std(image_path):
    """Return the per-channel mean and standard deviation of one image.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        A ``(mean, std)`` tuple of NumPy arrays of shape ``(3,)`` holding the
        R, G, B statistics computed over all pixels of the image.
    """
    # The context manager releases the file handle as soon as the pixels are
    # read instead of leaving that to garbage collection.
    with Image.open(image_path) as img:
        # Force three channels: a grayscale image would otherwise yield a
        # scalar and an RGBA/CMYK image four values, neither of which matches
        # the 3-element accumulators used by the caller.
        img_array = np.array(img.convert('RGB'))
    # Reduce over height and width, keeping the channel axis.
    mean = np.mean(img_array, axis=(0, 1))
    std = np.std(img_array, axis=(0, 1))
    return mean, std


# Running per-channel sums over all images.
total_mean = np.zeros(3)
total_std = np.zeros(3)  # holds the sum of per-image variances until the final sqrt
total_images = 0

# Training images, i.e. the "train" part produced by train_val_split.
path_train_images = f'{ROOT_DATA}/YoloFormat/train/images'
# List the directory once and reuse the result for both the count and the filter.
train_dir_entries = os.listdir(path_train_images)
print(f"Path ({path_train_images}) contain files: {len(train_dir_entries)}")

# Full paths of all files with a .jpeg extension.
jpeg_files = [os.path.join(path_train_images, f)
              for f in train_dir_entries if f.endswith('.jpeg')]

# Fail loudly instead of dividing by zero below and reporting NaN statistics.
if not jpeg_files:
    raise FileNotFoundError(f"No .jpeg files found in {path_train_images}")

print("Start")
# Use a thread pool to speed up the per-image computation.
with ThreadPoolExecutor() as executor:
    # submit() returns one future per image.
    futures = [executor.submit(compute_mean_std, file) for file in jpeg_files]

    # as_completed yields futures as they finish; tqdm shows the progress bar.
    for future in tqdm(as_completed(futures), total=len(jpeg_files), ncols=100):
        mean, std = future.result()
        total_mean += mean
        total_std += std ** 2  # accumulate the variance, not the std
        total_images += 1

# Average the per-image means and variances, then convert variance to std.
# NOTE(review): averaging per-image variances ignores the spread of the
# per-image means, so the result is at most the pooled pixel std. Kept as is
# because it matches the formula documented at the top of this file.
total_mean /= total_images
total_std = np.sqrt(total_std / total_images)

print(f"{total_mean=}")
print(f"{total_std=}")

/home/yeli/workspace/lard/lard-dataset
/home/yeli/workspace/lard/lard-detection
路径(/home/yeli/workspace/lard/lard-dataset/YoloFormat/train/images)下包括文件数量: 12989
开始计算


100%|█████████████████████████████████████████████████████████| 12989/12989 [06:55<00:00, 31.29it/s]

total_mean=array([121.97881021, 141.08208522, 164.55199028])
total_std=array([46.94337701, 54.84993929, 70.40161638])





In [None]:
# train_all, cost time: 14m 5.8s
# total_mean=array([122.02546603, 141.12094067, 164.56260058])
# total_std=array([46.92374909, 54.82440862, 70.39951964])

# train (80%), cost time: 9m 34.2s
# total_mean=array([122.00711516, 141.11828193, 164.56574534])
# total_std=array([46.91310377, 54.8164231 , 70.38650678])

# train (90%), cost time: 6m 56.9s
# total_mean=array([121.97881021, 141.08208522, 164.55199028])
# total_std=array([46.94337701, 54.84993929, 70.40161638])
