In [None]:
import os
import pandas as pd
import pydicom
import numpy as np
import cv2
from PIL import Image
import shutil
from tqdm import tqdm

# Seed NumPy's global RNG so any random behaviour in this run is repeatable.
np.random.seed(42)

# Inputs: box annotations, the DICOM path index, the DICOM root directory and
# the table that assigns every sample to a data split.
boxes_csv_path = "/Volumes/Newsmy/DBT/Test/BCS-DBT-boxes-test-v2-PHASE-2-Jan-2024.csv"
paths_csv_path = "/Volumes/Newsmy/DBT/Test/BCS-DBT-file-paths-test-v2.csv"
dcm_base_path = "/Volumes/Newsmy/DBT/Test/manifest-1617905855234"
split_csv_path = "../classification_data/classification_split.csv"

# Root directory that receives the extracted crops.
output_base = "../classification_data/DBT"
os.makedirs(output_base, exist_ok=True)

# Load the three tables.
boxes_df = pd.read_csv(boxes_csv_path)
paths_df = pd.read_csv(paths_csv_path)
split_df = pd.read_csv(split_csv_path)

# The split table covers several datasets; keep only the DBT rows.
split_df = split_df.loc[split_df["dataset"] == "DBT"]

# Number the boxes within each (PatientID, StudyUID, View) group so that
# several boxes on the same view remain distinguishable.
boxes_df["unique_id"] = boxes_df.groupby(["PatientID", "StudyUID", "View"]).cumcount()

# Attach the DICOM path of the matching view to every box (inner join).
merged_df = boxes_df.merge(paths_df, on=["PatientID", "StudyUID", "View"], how="inner")

# Absolute location of each DICOM file.
merged_df["full_dcm_path"] = f"{dcm_base_path}/" + merged_df["descriptive_path"]

# Make sure one sub-directory exists per data split.
split_folders = ["Train", "Eval", "Test"]
for folder in split_folders:
    os.makedirs(os.path.join(output_base, folder), exist_ok=True)

def process_dcm_and_save(row, output_dir):
    """Crop one annotated ROI out of a DICOM volume and save it with its label.

    The sample name is built as ``{Class}_{study_id}_{unique_id}`` and looked
    up in the module-level ``split_df`` to find its data split. The crop is
    written to ``<output_dir>/<data_split>/<data_name>/img.jpg`` together
    with ``info_dict.npy`` holding ``{"Pathology": "Malignant" | "Benign"}``.

    Args:
        row: Mapping-like record (e.g. a row of ``merged_df``) providing
            ``full_dcm_path``, ``descriptive_path``, ``Slice``, ``X``, ``Y``,
            ``Width``, ``Height``, ``Class`` and ``unique_id``.
        output_dir: Base directory that contains the per-split folders.

    Returns:
        True when the crop and its metadata were written. False when the row
        was skipped (no split entry, pixel data not 3D, slice out of range)
        or when any exception occurred; a message is printed in each case.
    """
    try:
        # Resolve the sample name and its split before touching the DICOM
        # file: this lookup is cheap, whereas reading and decoding the volume
        # is by far the slowest step, so unusable rows are rejected early.
        # The study id is the last '-'-separated token of the second-to-last
        # component of the descriptive path.
        path_parts = row['descriptive_path'].split('/')
        study_id = path_parts[-2].split('-')[-1] if len(path_parts) >= 2 else "unknown"

        # unique_id separates boxes sharing the same PatientID+StudyUID+View.
        data_name = f"{row['Class']}_{study_id}_{row['unique_id']}"

        split_info = split_df[split_df['data_name'] == data_name]
        if split_info.empty:
            print(f"No split info found for {data_name}")
            return False
        data_split = split_info['data_split'].values[0]

        # Read the DICOM file and decode its pixel data.
        dcm = pydicom.dcmread(row["full_dcm_path"])
        pixel_array = dcm.pixel_array

        # Only volumes (a stack of slices) can be indexed by slice.
        if pixel_array.ndim != 3:
            print(f"Skipping {row['full_dcm_path']} - not 3D data")
            return False

        # The Slice column is treated as 1-based here.
        # NOTE(review): confirm the indexing convention of the boxes CSV.
        slice_idx = row["Slice"] - 1
        # Check both bounds: a negative index would not fail but silently
        # wrap around and select a slice from the end of the volume.
        if not 0 <= slice_idx < pixel_array.shape[0]:
            print(f"Skipping {row['full_dcm_path']} - slice {row['Slice']} out of range")
            return False

        slice_img = pixel_array[slice_idx]

        # Min-max rescale to 0-255 and cast to uint8 unless already 8-bit.
        if slice_img.dtype != np.uint8:
            slice_img = cv2.normalize(slice_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Crop the annotated box; PIL expects (left, upper, right, lower).
        img = Image.fromarray(slice_img)
        x, y, width, height = row["X"], row["Y"], row["Width"], row["Height"]
        roi = img.crop((x, y, x + width, y + height))

        output_folder = os.path.join(output_dir, data_split, data_name)
        os.makedirs(output_folder, exist_ok=True)

        # Save the cropped image.
        roi.save(os.path.join(output_folder, "img.jpg"))

        # Save the label. np.save stores a dict as a pickled object array, so
        # it has to be loaded with np.load(..., allow_pickle=True).
        pathology = "Malignant" if row["Class"] == "cancer" else "Benign"
        metadata = {
            "Pathology": pathology
        }
        np.save(os.path.join(output_folder, "info_dict.npy"), metadata)

        return True

    except Exception as e:
        # Deliberate per-row boundary: one unreadable or malformed file must
        # not abort the whole batch, so report it and carry on.
        print(f"Error processing {row['full_dcm_path']}: {str(e)}")
        return False

# Run every merged box row through the extractor, which writes each crop
# straight into its split directory, and tally the rows that succeeded.
processed_count = sum(
    1
    for _, row in tqdm(merged_df.iterrows(), total=len(merged_df))
    if process_dcm_and_save(row, output_base)
)

print(f"Processing completed successfully! Total processed: {processed_count}")

 54%|█████▍    | 74/136 [16:28<21:39, 20.96s/it]

Error processing /Volumes/Newsmy/DBT/Test/manifest-1617905855234/Breast-Cancer-Screening-DBT/DBT-P02308/01-01-2000-DBT-S00377-MAMMO diagnostic digital bilateral-02419/5940.000000-66346/1-1.dcm: 'FileDataset' object has no attribute 'PixelData'


100%|██████████| 136/136 [30:45<00:00, 13.57s/it]


Processing completed successfully! Total processed: 135
