In [None]:
import os
import numpy as np
from tqdm import tqdm
from PIL import Image
import pandas as pd

# Path configuration
# Root of the raw dataset; process_dataset() lists the "Cancer" and
# "Non-Cancer" sub-folders beneath it.
original_path = "/Volumes/Newsmy/MM/Breast Cancer Dataset/Original Dataset"
# Output root. The __main__ block deletes and recreates this directory, and
# samples are written to <new_dataset_path>/<data_split>/<sample_name>/.
new_dataset_path = "../classification_data/MM"
# CSV read by process_dataset(); the columns it accesses are 'dataset',
# 'data_name' and 'data_split'.
split_csv_path = "../classification_data/classification_split.csv"

def np_CountUpContinuingOnes(b_arr):
    """Return, per position, the length of the run of ones it belongs to.

    Args:
        b_arr: 1-D boolean or numeric array; entries ``> 0`` count as ones.

    Returns:
        An integer array of the same length.  Each position inside a run of
        consecutive ones holds that run's length; each position that is not
        a one holds ``-1``.
    """
    b_arr = np.asarray(b_arr)
    n = len(b_arr)
    is_one = b_arr > 0
    positions = np.arange(n)

    # Index of the nearest non-one at or before each position.  The sentinel
    # -1 (one step before the array) stands for "none", so a run touching the
    # left edge is measured from the edge rather than being confused with a
    # real zero at index 0.
    left = np.maximum.accumulate(np.where(is_one, -1, positions))

    # Index of the nearest non-one at or after each position, with sentinel n
    # (one step past the array) for runs touching the right edge.
    right = np.minimum.accumulate(np.where(is_one, n, positions)[::-1])[::-1]

    # Ones: distance between the two bounding non-ones, exclusive.
    # Non-ones: left == right == own index, giving -1.
    return right - left - 1

def ExtractBreast(img_array):
    """Crop a 2-D image array to its longest non-flat column and row runs.

    Pixels with value ``<= 40`` are zeroed in a working copy.  Columns are
    then selected by looking at the central band of rows (the middle row
    +/- 40% of the height): a column is "active" when its standard deviation
    over that band is non-zero, and the columns kept are those whose run
    length, as reported by ``np_CountUpContinuingOnes``, equals the maximum.
    Rows are selected the same way on the column-cropped working copy.

    Args:
        img_array: 2-D numpy array of pixel values.

    Returns:
        The selected rows and columns taken from an unmodified copy of the
        input, so the thresholding does not affect the returned pixel values.
    """
    def _positions_of_max_run(flags):
        # Indices whose run length equals the largest run length.
        run_lengths = np_CountUpContinuingOnes(flags)
        return np.flatnonzero(run_lengths == run_lengths.max())

    untouched = img_array.copy()
    masked = np.where(img_array <= 40, 0, img_array)

    # Column selection, judged on the central band of rows.
    n_rows, _ = masked.shape
    row_mid, row_half = n_rows // 2, int(n_rows * 0.4)
    row_band = masked[row_mid - row_half:row_mid + row_half]
    col_ind = _positions_of_max_run(row_band.std(axis=0) != 0)
    masked = masked[:, col_ind]

    # Row selection, judged on the central band of the remaining columns.
    _, n_cols = masked.shape
    col_mid, col_half = n_cols // 2, int(n_cols * 0.4)
    col_band = masked[:, col_mid - col_half:col_mid + col_half]
    row_ind = _positions_of_max_run(col_band.std(axis=1) != 0)

    kept_rows = untouched[row_ind]
    return kept_rows[:, col_ind]

def process_image(img_path):
    """Load one image as 8-bit grayscale and crop it with ``ExtractBreast``.

    Args:
        img_path: Path of the image file to open.

    Returns:
        A PIL ``Image`` built from the cropped pixel array.

    Raises:
        ValueError: If the decoded image contains no pixels.
    """
    # Open inside a context manager so the underlying file handle is released
    # immediately instead of lingering until garbage collection; this matters
    # when looping over many files.  The array is materialised before the
    # file is closed.
    with Image.open(img_path) as img:
        img_array = np.array(img.convert('L'))

    if img_array.size == 0:
        raise ValueError("Empty image array")

    processed_array = ExtractBreast(img_array)
    return Image.fromarray(processed_array)

def process_dataset():
    """Crop every source image and store it under its assigned data split.

    Reads ``split_csv_path`` and keeps the rows whose ``dataset`` column is
    ``'MM'``.  For the ``Cancer`` and ``Non-Cancer`` folders under
    ``original_path``, every ``.jpg``/``.jpeg``/``.png`` file whose name
    starts with ``IMG`` is run through ``process_image`` and named
    ``<class_name>_img<index>``; the index is one counter shared by both
    classes, starting at 1.  The sample's split is the ``data_split`` value
    of the CSV row whose ``data_name`` equals that name.

    Each sample is written to
    ``<new_dataset_path>/<data_split>/<sample_name>/`` as ``img.jpg`` plus
    ``info_dict.npy`` (a dict with the key ``"Pathology"``).

    Images whose sample name has no row in the CSV are reported and skipped.
    """
    # Load the split assignment table, restricted to this dataset.
    split_df = pd.read_csv(split_csv_path)
    split_df = split_df[split_df['dataset'] == 'MM']

    global_idx = 1

    for class_name in ["Cancer", "Non-Cancer"]:
        pathology_label = "Malignant" if class_name == "Cancer" else "Benign"
        src_dir = os.path.join(original_path, class_name)

        # Keep only the image files that will actually be processed, so the
        # progress bar total matches the work done.  The prefix is tested on
        # the bare file name rather than by splitting the joined path on '/',
        # which only works where '/' is the path separator.
        # NOTE(review): os.listdir returns entries in arbitrary order, so the
        # image -> index mapping depends on the filesystem; confirm whether
        # the CSV expects a particular order before sorting here.
        files = [
            f for f in os.listdir(src_dir)
            if f.startswith('IMG')
            and f.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]

        for filename in tqdm(files, desc=f"Processing {class_name}"):
            processed_img = process_image(os.path.join(src_dir, filename))
            if processed_img is None:
                # Defensive guard kept from the original code.
                continue

            sample_name = f"{class_name}_img{global_idx}"
            # Advance the counter before the lookup.  Previously a missing
            # CSV row skipped the increment, so every following image was
            # given the same missing name and skipped as well.
            global_idx += 1

            # Look up the split assigned to this sample.
            split_info = split_df[split_df['data_name'] == sample_name]
            if split_info.empty:
                print(f"No split info found for {sample_name}")
                continue

            data_split = split_info['data_split'].values[0]

            # Create the sample directory.
            sample_path = os.path.join(new_dataset_path, data_split, sample_name)
            os.makedirs(sample_path, exist_ok=True)

            # Save the processed image.
            processed_img.save(os.path.join(sample_path, "img.jpg"))

            # Save the metadata.  np.save stores the dict as a pickled
            # object array.
            info_dict = {
                "Pathology": pathology_label
            }
            np.save(os.path.join(sample_path, "info_dict.npy"), info_dict)

if __name__ == "__main__":
    # Start from an empty output directory: remove any previous run's
    # results, then recreate the root.
    if os.path.exists(new_dataset_path):
        import shutil
        shutil.rmtree(new_dataset_path)
    os.makedirs(new_dataset_path)

    # Build the dataset.
    process_dataset()

    # Report how many sample directories ended up in each split.
    for split in ["Train", "Eval", "Test"]:
        split_path = os.path.join(new_dataset_path, split)
        if not os.path.exists(split_path):
            print(f"{split} set: 0 samples (directory not found)")
            continue
        count = len(os.listdir(split_path))
        print(f"{split} set: {count} samples")

    print(f"Processing complete. Results saved to: {new_dataset_path}")

Cancer -> Train: 100%|██████████| 175/175 [00:08<00:00, 20.11it/s]
Cancer -> Eval: 100%|██████████| 25/25 [00:02<00:00, 11.69it/s]
Cancer -> Test: 100%|██████████| 50/50 [00:02<00:00, 17.71it/s]
Non-Cancer -> Train: 100%|██████████| 868/868 [00:45<00:00, 19.05it/s]
Non-Cancer -> Eval: 100%|██████████| 124/124 [00:05<00:00, 21.31it/s]
Non-Cancer -> Test: 100%|██████████| 248/248 [00:14<00:00, 16.60it/s]

Train set: 1022 samples
Eval set: 146 samples
Test set: 322 samples
Processing complete. Results saved to: /Volumes/Newsmy/MM/Breast Cancer Dataset/MM



