In [None]:
import os
import numpy as np
import pandas as pd
import pydicom
from PIL import Image
import cv2
from tqdm import tqdm

# Path configuration.
# metadata_path: CSV describing every NLBS image.
# base_data_path: root directory that the metadata's relative 'File Path'
#                 entries are joined onto.
# output_base: root directory under which the processed samples are written.
# split_csv_path: CSV that is filtered by its 'dataset' column and used to look
#                 up each sample's 'data_split'.
metadata_path = "/Volumes/Newsmy/NLBSD/NLBS Data/NLBSP-metadata.csv"
base_data_path = "/Volumes/Newsmy/NLBSD/NLBS Data"
output_base = "../classification_data/NLBS"
split_csv_path = "../classification_data/classification_split.csv"

# Create the output directory (no error if it already exists).
os.makedirs(output_base, exist_ok=True)

# Breast-region extraction helpers
def np_CountUpContinuingOnes(b_arr):
    """Return, for every position, the length of the run of ones it lies in.

    Args:
        b_arr: 1-D boolean or numeric array-like. Entries ``> 0`` count as
            "ones"; everything else is a run separator.

    Returns:
        Integer array with the same length as ``b_arr``. A position inside a
        run of ones holds that run's full length; a position that is not a
        one holds ``-1``.
    """
    b_arr = np.asarray(b_arr)
    n = len(b_arr)
    is_one = b_arr > 0
    positions = np.arange(n)

    # Index of the nearest separator at or before each position. The sentinel
    # -1 ("no separator to the left") cannot be confused with a real separator
    # at index 0, so runs touching the start are measured correctly.
    left = np.maximum.accumulate(np.where(is_one, -1, positions))

    # Index of the nearest separator at or after each position, found by a
    # running minimum over the reversed array. The sentinel n plays the same
    # role for runs touching the end.
    right = np.minimum.accumulate(np.where(is_one, n, positions)[::-1])[::-1]

    return right - left - 1

def _first_longest_run(run_lengths):
    """Return the indices of the first contiguous block holding the maximum."""
    best = np.flatnonzero(run_lengths == run_lengths.max())
    # If several runs tie for the maximum, `best` contains gaps. Keep only the
    # first run so the crop is a single rectangle, not stitched strips.
    gaps = np.flatnonzero(np.diff(best) > 1)
    if gaps.size:
        best = best[:gaps[0] + 1]
    return best


def ExtractBreast(img_array, threshold=40, band_fraction=0.4):
    """Crop a 2-D image to its longest run of non-constant columns and rows.

    Pixels ``<= threshold`` are zeroed on a working copy. Columns are then
    kept where the central band of rows has non-zero standard deviation, and
    only the longest contiguous run of such columns survives. The same is
    repeated for rows on the column-cropped result. The crop is taken from
    the untouched input, so returned pixel values are the original ones.

    Args:
        img_array: 2-D numpy array (rows, columns).
        threshold: Intensities at or below this value are treated as
            background when locating the region. Defaults to 40.
        band_fraction: Half-height (and half-width) of the central band used
            for the standard-deviation test, as a fraction of the image size.
            The default 0.4 inspects the central 80 %. Values above 0.5 make
            the band start negative and are not supported.

    Returns:
        A new 2-D array containing the selected rows and columns of
        ``img_array``.
    """
    # np.where builds a new array, so the input is never modified and no
    # defensive copy of the full image is needed.
    masked = np.where(img_array <= threshold, 0, img_array)
    height, _ = masked.shape

    # Horizontal cropping: pick columns using the central band of rows.
    row_half = int(height * band_fraction)
    row_hi = height // 2 + row_half
    row_lo = height // 2 - row_half
    col_active = masked[row_lo:row_hi].std(axis=0) != 0
    col_ind = _first_longest_run(np_CountUpContinuingOnes(col_active))
    masked = masked[:, col_ind]

    # Vertical cropping: pick rows using the central band of kept columns.
    _, width = masked.shape
    col_half = int(width * band_fraction)
    col_hi = width // 2 + col_half
    col_lo = width // 2 - col_half
    row_active = masked[:, col_lo:col_hi].std(axis=1) != 0
    row_ind = _first_longest_run(np_CountUpContinuingOnes(row_active))

    # Index-array selection always returns a fresh array.
    return img_array[row_ind][:, col_ind]

# Load the image-level metadata and the NLBS rows of the split table.
metadata_df = pd.read_csv(metadata_path)
split_df = pd.read_csv(split_csv_path)
split_df = split_df[split_df['dataset'] == 'NLBS']

# Build the data_name -> data_split mapping once instead of filtering the
# whole DataFrame for every image. keep='first' matches the first-hit
# behaviour of the previous per-row lookup when a name is duplicated.
unique_splits = split_df.drop_duplicates(subset='data_name', keep='first')
split_lookup = dict(zip(unique_splits['data_name'], unique_splits['data_split']))

# Normalise path separators and build absolute paths. regex=False makes the
# backslash a literal on every pandas version (the default changed in 2.0).
metadata_df['File Path'] = metadata_df['File Path'].str.replace('\\', '/', regex=False)
metadata_df['full_path'] = base_data_path + '/' + metadata_df['File Path']

# Process every DICOM file. Each row is best-effort: any failure is reported
# and the loop moves on to the next file.
for idx, row in tqdm(metadata_df.iterrows(), total=len(metadata_df)):
    try:
        # Derive the sample name from the relative path and view metadata:
        # "<first path component>-<second component>-<laterality>-<view>".
        file_path = row['File Path']
        parts = [p for p in file_path.split('/') if p]  # drop empty components
        study_id = parts[1] if len(parts) > 1 else "unknown"
        laterality = row['Image Laterality']
        view = row['View Position']

        data_name = f"{parts[0]}-{study_id}-{laterality.lower()}-{view}"

        # Resolve the split before touching the DICOM, so files with no split
        # entry are skipped without paying for decoding and cropping.
        data_split = split_lookup.get(data_name)
        if data_split is None:
            print(f"No split info found for {data_name}")
            continue

        # Read the DICOM pixel data.
        dcm = pydicom.dcmread(row['full_path'])
        img_array = dcm.pixel_array

        # Rescale to 8-bit unsigned (0-255) unless it already is.
        if img_array.dtype != np.uint8:
            img_array = cv2.normalize(img_array, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Crop to the breast region.
        breast_img = ExtractBreast(img_array)

        output_folder = os.path.join(output_base, data_split, data_name)
        os.makedirs(output_folder, exist_ok=True)

        # Save the cropped image.
        img_pil = Image.fromarray(breast_img)
        img_pil.save(os.path.join(output_folder, "img.jpg"))

        # Derive the pathology label from the metadata flags.
        info_dict = {}
        cancer = row['Cancer']
        false_positive = row['False Positive']
        if cancer == 1 and false_positive == 0:
            info_dict['Pathology'] = 'Malignant'
        elif cancer == 0 and false_positive in [0, 1]:
            info_dict['Pathology'] = 'Benign'
        else:
            # The dict is still saved (as before) but without a 'Pathology'
            # key; say so instead of leaving an unlabeled sample unnoticed.
            print(
                f"Warning: no pathology label for {data_name} "
                f"(Cancer={cancer}, False Positive={false_positive})"
            )

        # np.save pickles the dict; load it with allow_pickle=True.
        np.save(os.path.join(output_folder, "info_dict.npy"), info_dict)

    except Exception as e:
        print(f"Error processing {row['full_path']}: {str(e)}")
        continue

print("Processing completed!")

 15%|█▍        | 4032/26988 [15:36<1:30:48,  4.21it/s]

Error processing /Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859411425019/right/MLO/IM-0079-0003-0001.dcm: The length of the pixel data in the dataset (3330668 bytes) doesn't match the expected length (14660856 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler.
Error processing /Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859442525506/left/CC/IM-0081-0002-0001.dcm: [Errno 2] No such file or directory: '/Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859442525506/left/CC/IM-0081-0002-0001.dcm'
Error processing /Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859442525506/left/MLO/IM-0083-0004-0001.dcm: [Errno 2] No such file or directory: '/Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859442525506/left/MLO/IM-0083-0004-0001.dcm'
Error processing /Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B859442525506/right/CC/IM-0080-0001-0001.dcm: [Errno 2] No such file or directory: '/Volumes/Newsmy/NLBSD/NLBS Data/False Positive/2_B8594

100%|██████████| 26988/26988 [1:56:01<00:00,  3.88it/s]

Processing completed!



