DMID-breast

In [None]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2

def np_CountUpContinuingOnes(b_arr):
    """Return, for every position, the length of the run of ones containing it.

    Args:
        b_arr: 1-D NumPy array; entries greater than zero count as "one".

    Returns:
        Integer array of the same length. A position inside a run of ones
        holds that run's length; a position that is not a one holds -1.
    """
    n = len(b_arr)
    idx = np.arange(n)
    ones = b_arr > 0

    # Index of the closest non-one at or before each position. -1 acts as a
    # virtual zero just before the array, so a run that starts at index 0 is
    # measured in full (using 0 as the sentinel undercounts it by one).
    left = np.maximum.accumulate(np.where(ones, -1, idx))

    # Same scan on the reversed array, mapped back to forward indices; the
    # virtual zero ends up at index n, just past the end of the array.
    right_rev = np.maximum.accumulate(np.where(ones[::-1], -1, idx))
    right = n - 1 - right_rev[::-1]

    return right - left - 1

def ExtractBreast(img):
    """Crop a 2-D image to its dominant non-flat band of columns and rows.

    Pixels with value <= 40 are zeroed on a working copy. Columns are kept
    where the run count from np_CountUpContinuingOnes equals its maximum,
    judged on the central 80% of rows; rows are then chosen the same way on
    the column-cropped working copy. The returned crop is taken from the
    untouched input values, not from the thresholded copy.
    """
    untouched = img.copy()
    work = np.where(img <= 40, 0, img)

    # Columns: a column counts as "active" when it is not constant over the
    # central band of rows.
    n_rows, _ = work.shape
    row_margin = int(n_rows * 0.4)
    row_hi = n_rows // 2 + row_margin
    row_lo = n_rows // 2 - row_margin
    col_active = work[row_lo:row_hi].std(axis=0) != 0
    col_runs = np_CountUpContinuingOnes(col_active)
    col_ind = np.where(col_runs == col_runs.max())[0]
    work = work[:, col_ind]

    # Rows: same test, on the central band of the remaining columns.
    _, n_cols = work.shape
    col_margin = int(n_cols * 0.4)
    col_hi = n_cols // 2 + col_margin
    col_lo = n_cols // 2 - col_margin
    row_active = work[:, col_lo:col_hi].std(axis=1) != 0
    row_runs = np_CountUpContinuingOnes(row_active)
    row_ind = np.where(row_runs == row_runs.max())[0]

    return untouched[row_ind][:, col_ind]

# Load the data-split CSV and keep only the rows of the DMID-breast dataset.
split_csv_path = "../classification_data/classification_split.csv"
split_df = pd.read_csv(split_csv_path)
split_df = split_df.loc[split_df['dataset'].eq('DMID-breast')]

# Load the metadata spreadsheet.
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
df = pd.read_excel(xlsx_path)

# Merge metadata rows that share the same ID.
def merge_duplicates(df):
    """Collapse the metadata rows into one record per image ID.

    The first row seen for an ID supplies 'view' and 'left_or_right'. Any
    later row for the same ID overwrites 'background_tissue' unless its value
    is '-'. Spaces are removed from the view and background-tissue values,
    and the ID is stripped of surrounding whitespace.

    Returns:
        dict mapping ID -> {'view', 'left_or_right', 'background_tissue'},
        in order of first appearance.
    """
    merged = {}
    for _, record in df.iterrows():
        key = record['ID'].strip()
        view = str(record['view']).replace(' ', '')
        tissue = str(record['background tissue']).replace(' ', '')

        entry = merged.get(key)
        if entry is None:
            merged[key] = {
                'view': view,
                # Views ending in 'LT' are labelled Left; everything else Right.
                'left_or_right': 'Left' if view.endswith('LT') else 'Right',
                'background_tissue': tissue,
            }
        elif tissue != '-':
            entry['background_tissue'] = tissue

    return merged

# Collapse the metadata to one entry per image ID.
merged_data = merge_duplicates(df)
df_sorted = pd.DataFrame(list(merged_data.items()), columns=['ID', 'data'])

# Attach the split label to each image; the inner join drops IDs that do not
# appear in the split table.
df_sorted = df_sorted.merge(
    split_df[['data_name', 'data_split']],
    how='inner',
    left_on='ID',
    right_on='data_name',
)

# Input and output locations.
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
OUTPUT_BASE_PATH = "../classification_data/DMID"

def process_and_save():
    """Convert each image listed in df_sorted to a cropped JPEG plus metadata.

    For every row of the module-level df_sorted, reads
    DCM_PATH/<ID>.dcm, crops it with ExtractBreast, rescales it to 0-255
    uint8 and writes:

        OUTPUT_BASE_PATH/<data_split>/<ID>/img.jpg
        OUTPUT_BASE_PATH/<data_split>/<ID>/info_dict.npy

    info_dict.npy holds a dict with a 'Composition' key taken from the
    background-tissue value; the key is omitted when that value is '-'.
    Missing DICOM files and failed image writes are reported and skipped.
    """
    for _, row in df_sorted.iterrows():
        file_id = row['ID'].strip()
        data_split = row['data_split']
        background_tissue = row['data']['background_tissue']

        # '-' is treated as "no value": leave the key out entirely.
        meta_data = {} if background_tissue == '-' else {'Composition': background_tissue}

        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")
        if not os.path.exists(dcm_file):
            print(f"DICOM file for {file_id} not found.")
            continue

        img = pdcm.dcmread(dcm_file).pixel_array
        if img.ndim == 3 and img.shape[2] == 3:
            # pydicom returns colour samples in DICOM order (R, G, B for
            # PhotometricInterpretation RGB); BGR2GRAY would swap the red and
            # blue weights.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        img = ExtractBreast(img)
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        output_dir = os.path.join(OUTPUT_BASE_PATH, data_split, file_id)
        os.makedirs(output_dir, exist_ok=True)

        jpg_path = os.path.join(output_dir, 'img.jpg')
        # cv2.imwrite signals failure through its return value, not an
        # exception; do not write metadata for an image that was not saved.
        if not cv2.imwrite(jpg_path, img):
            print(f"Failed to write {jpg_path}; skipping {file_id}.")
            continue

        npy_path = os.path.join(output_dir, 'info_dict.npy')
        np.save(npy_path, meta_data)

        print(f"Processed {file_id} for {data_split} set")

# Process and save the data.
process_and_save()
print("Processing complete.")

Processed IMG406 for Train set
Processed IMG223 for Train set
Processed IMG403 for Train set
Processed IMG215 for Train set
Processed IMG161 for Train set
Processed IMG008 for Train set
Processed IMG027 for Train set
Processed IMG423 for Train set
Processed IMG062 for Train set
Processed IMG046 for Train set
Processed IMG278 for Train set
Processed IMG141 for Train set
Processed IMG082 for Train set
Processed IMG264 for Train set
Processed IMG362 for Train set
Processed IMG271 for Train set
Processed IMG093 for Train set
Processed IMG419 for Train set
Processed IMG231 for Train set
Processed IMG371 for Train set
Processed IMG092 for Train set
Processed IMG038 for Train set
Processed IMG383 for Train set
Processed IMG368 for Train set
Processed IMG272 for Train set
Processed IMG067 for Train set
Processed IMG455 for Train set
Processed IMG444 for Train set
Processed IMG053 for Train set
Processed IMG460 for Train set
Processed IMG244 for Train set
Processed IMG081 for Train set
Processe

DMID-finding

In [None]:
import os
import numpy as np
import pandas as pd
import pydicom as pdcm
import cv2

def crop_and_save(img, x, y, radius, output_path):
    """Crop a square patch around (x, y), rescale it to 8 bits and save it.

    Args:
        img: image array indexed as img[row, column].
        x: column of the patch centre; any value int() accepts.
        y: row of the patch centre; any value int() accepts.
        radius: half the side length of the patch, in pixels.
        output_path: destination image file; parent directories are created.

    Returns:
        True when the patch was written. False when the coordinates cannot
        be converted, the patch falls outside the image, or any step of
        normalising / writing fails. The reason is printed.
    """
    try:
        x = int(x)
        y = int(y)
        radius = int(radius)
        x1 = max(x - radius, 0)
        y1 = max(y - radius, 0)
        x2 = min(x + radius, img.shape[1])
        y2 = min(y + radius, img.shape[0])

        # A negative upper bound would be read by NumPy as "from the end",
        # yielding an unrelated region instead of an empty one.
        if x2 <= x1 or y2 <= y1:
            print(f"Empty crop for {output_path}: x={x}, y={y}, radius={radius}")
            return False

        cropped_img = img[y1:y2, x1:x2]
        cropped_img = cv2.normalize(cropped_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # cv2.imwrite reports failure through its return value.
        return bool(cv2.imwrite(output_path, cropped_img))
    except Exception as exc:
        # Best-effort by design: report and carry on, but let
        # KeyboardInterrupt / SystemExit propagate.
        print(f"Failed to crop/save {output_path}: {exc}")
        return False

# Load the data-split CSV and keep only the rows of the DMID-finding dataset.
split_csv_path = "../classification_data/classification_split.csv"
split_df = pd.read_csv(split_csv_path)
split_df = split_df[split_df['dataset'] == 'DMID-finding']

# Load the metadata spreadsheet.
xlsx_path = '/Volumes/图图/DMID-kaggle/archive/Metadata.xlsx'
df = pd.read_excel(xlsx_path)
# Drop the normal cases. Series.replace(' ', '') only substitutes cells whose
# whole value is ' ', so values such as 'NORM ' would slip through; the .str
# accessor removes embedded spaces, matching the per-row normalisation that
# process_and_save applies to this column.
df = df[df['abnormality'].astype(str).str.replace(' ', '', regex=False) != 'NORM']

# Input and output locations.
DCM_PATH = "/Volumes/图图/DMID-kaggle/archive/DICOM Images/DICOM Images"
OUTPUT_BASE_PATH = "../classification_data/DMID-finding"

def process_and_save():
    """Crop every annotated finding in df and save it with its labels.

    Each row of the module-level df becomes one sample named
    '<ID>_<k>', where k counts the rows seen so far for that ID (starting at
    1). The sample's split is looked up in split_df by that name; rows with
    no split entry are reported and skipped. For the rest, the DICOM at
    DCM_PATH/<ID>.dcm is read, a patch is cut with crop_and_save using the
    row's x / y / radius, and on success the labels are stored next to it:

        OUTPUT_BASE_PATH/<data_split>/<ID>_<k>/img.jpg
        OUTPUT_BASE_PATH/<data_split>/<ID>_<k>/info_dict.npy

    info_dict.npy holds {'Finding': [...], 'Pathology': ...}.
    """
    # Abnormality code -> readable name. 'CLAC' is accepted as a variant
    # spelling of 'CALC'.
    finding_names = {
        'CALC': 'Calcification',
        'CLAC': 'Calcification',
        'CIRC': 'Circumscribed masses',
        'SPIC': 'Spiculated masses',
        'ARCH': 'Architectural distortion',
        'ASYM': 'Asymmetry',
        'MISC': 'Miscellaneous',
    }
    # 'N' is folded into benign; any other unknown value is kept as-is.
    pathology_names = {'B': 'Benign', 'N': 'Benign', 'M': 'Malignant'}

    # Build the name -> split lookup once instead of scanning split_df for
    # every row. drop_duplicates keeps the first occurrence, matching the
    # "first match wins" behaviour of a filtered .values[0].
    first_entries = split_df.drop_duplicates(subset='data_name')
    split_lookup = dict(zip(first_entries['data_name'], first_entries['data_split']))

    global_counter = {}  # per-ID counter used to build the unique suffix

    for _, row in df.iterrows():
        abnormality = str(row['abnormality']).replace(' ', '')
        pathology = str(row['pathology']).replace(' ', '')

        # Several findings may be joined with '+'.
        codes = abnormality.split('+')
        # Unknown codes are dropped when at least one code is recognised;
        # if none is recognised the raw codes are kept.
        named = [finding_names[code] for code in codes if code in finding_names]
        info_dict = {
            'Finding': named if named else codes,
            'Pathology': pathology_names.get(pathology, pathology),
        }

        file_id = row['ID'].strip()
        x, y, radius = row['x'], row['y'], row['radius']

        # The counter advances for every row, including ones skipped below,
        # so suffixes stay aligned with the row order of df.
        global_counter[file_id] = global_counter.get(file_id, 0) + 1
        file_id_with_suffix = f"{file_id}_{global_counter[file_id]}"

        if file_id_with_suffix not in split_lookup:
            print(f"No split info found for {file_id_with_suffix}")
            continue
        data_split = split_lookup[file_id_with_suffix]

        dcm_file = os.path.join(DCM_PATH, f"{file_id}.dcm")
        if not os.path.exists(dcm_file):
            print(f"DICOM file for {file_id} not found.")
            continue

        img = pdcm.dcmread(dcm_file).pixel_array
        if img.ndim == 3 and img.shape[2] == 3:
            # pydicom returns colour samples in DICOM order (R, G, B for
            # PhotometricInterpretation RGB); BGR2GRAY would swap the red and
            # blue weights.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        output_dir = os.path.join(OUTPUT_BASE_PATH, data_split, file_id_with_suffix)
        jpg_path = os.path.join(output_dir, 'img.jpg')

        # Only store the labels when the patch itself was written.
        if not crop_and_save(img, x, y, radius, jpg_path):
            print(f"Could not crop/save {file_id_with_suffix}; skipping.")
            continue

        npy_path = os.path.join(output_dir, 'info_dict.npy')
        np.save(npy_path, info_dict)
        print(f"Processed {file_id_with_suffix} for {data_split} set")

# Process and save the data.
process_and_save()
print("Processing complete.")

Processed IMG283 for Train set as IMG283_1
Processed IMG459 for Train set as IMG459_1
Processed IMG085 for Train set as IMG085_1
Processed IMG288 for Train set as IMG288_1
Processed IMG315 for Train set as IMG315_1
Processed IMG277 for Train set as IMG277_1
Processed IMG064 for Train set as IMG064_1
Processed IMG489 for Train set as IMG489_1
Processed IMG278 for Train set as IMG278_1
Processed IMG283 for Train set as IMG283_2
Processed IMG132 for Train set as IMG132_1
Processed IMG305 for Train set as IMG305_1
Processed IMG090 for Train set as IMG090_1
Processed IMG306 for Train set as IMG306_1
Processed IMG144 for Train set as IMG144_1
Processed IMG150 for Train set as IMG150_1
Processed IMG476 for Train set as IMG476_1
Processed IMG369 for Train set as IMG369_1
Processed IMG323 for Train set as IMG323_1
Processed IMG301 for Train set as IMG301_1
Processed IMG294 for Train set as IMG294_1
Processed IMG410 for Train set as IMG410_1
Processed IMG066 for Train set as IMG066_1
Processed I