seg&det_split

In [None]:
import os
import pandas as pd
import numpy as np
from skimage.draw import polygon
import cv2
import pydicom as pdcm
import math
import xml.etree.ElementTree as ET

class Annotation:
    """Mass annotation (binary mask plus bounding boxes) for a single image.

    On construction the instance
      * allocates an all-zero ``uint8`` mask with the image's height/width,
      * paints every ROI of type ``"Mass"`` found in ``<filename>.xml``
        (if that file exists) into the mask, and
      * loads the ``[x1, y1, x2, y2]`` boxes for this file from the CSV
        (if that file exists).

    Args:
        xml_path: Directory that contains the per-image XML annotations.
        csv_path: Path of the bounding-box CSV; it is read with the columns
            ``File Name``, ``X``, ``Y``, ``W`` and ``H``.
        filename: Image identifier without extension.
        shape: Shape of the image; only the first two entries are used.
    """

    def __init__(self, xml_path, csv_path, filename, shape):
        # os.path.join yields the same path as plain concatenation when
        # xml_path ends with a separator, and still works when it does not.
        self.xml_path = os.path.join(xml_path, filename + '.xml')
        self.csv_path = csv_path
        self.filename = filename
        self.shape = shape
        self.mask_mass = self.create_mask_array(shape)
        self.fill_mask()
        self.bboxes = self.load_bboxes_from_csv()

    def fill_mask(self):
        """Paint every "Mass" ROI of the XML annotation into ``self.mask_mass``.

        Does nothing when the XML file does not exist.  Polygon pixels that
        fall outside the mask are dropped and the file is reported with an
        ``out of bound:`` message.
        """
        if not os.path.exists(self.xml_path):
            return

        rois, _ = self.parse_XML(self.xml_path)
        height, width = self.mask_mass.shape

        for roi in rois:
            roi_info = self.get_roi_info(roi)
            if roi_info["roi_type"] != "Mass":
                # Only masses are written to the mask, so there is no point
                # in rasterising the other ROI types.
                continue

            r_poly, c_poly = self.create_polygon_lists(self.mask_mass, roi_info["points"])
            rr, cc = polygon(r_poly, c_poly)

            # Filter explicitly instead of waiting for an IndexError: NumPy
            # wraps negative indices around without raising, which would
            # silently paint pixels on the opposite edge of the mask.
            inside = (rr >= 0) & (rr < height) & (cc >= 0) & (cc < width)
            if not inside.all():
                rr = rr[inside]
                cc = cc[inside]
                print('out of bound:', self.xml_path)

            self.mask_mass[rr, cc] = 1  # Mass mask

    def parse_XML(self, xml_path):
        """Parse the annotation file and return ``(rois, num_of_rois)``.

        The elements are addressed purely by position, so this only works for
        XML files that follow the exact layout the indices below expect.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()  # The root of the XML file
        data = root[0][1]  # The essential info
        rois = data[0][5]  # Array containing the ROI objects
        num_of_rois = int(data[0][3].text)  # Number of ROI objects
        return rois, num_of_rois

    def create_mask_array(self, img_shape):
        """Return an all-zero ``uint8`` array of ``img_shape[0] x img_shape[1]``."""
        return np.zeros((img_shape[0], img_shape[1]), dtype=np.uint8)

    def get_roi_info(self, roi):
        """Collect the fields of one ROI element (addressed by position)."""
        roi_info = {
            "points": roi[21],  # Array containing the points of a ROI
            "num_of_points": int(roi[17].text),  # Number of points of the area
            "roi_index": int(roi[7].text),  # Identifier of the ROI
            "roi_type": roi[15].text  # (Mass, Calcification, other)
        }
        return roi_info

    def create_polygon_lists(self, mask, points):
        """Convert ROI point elements into polygon row/column coordinates.

        Each point's text is expected to look like ``"(x, y)"``; both values
        are truncated towards zero.

        Args:
            mask: Unused; kept so existing callers keep working.
            points: Iterable of elements exposing the point string as ``.text``.

        Returns:
            ``(r_poly, c_poly)`` as float arrays, where rows come from the
            second value of each point and columns from the first.
        """
        rows = []
        cols = []
        for point in points:
            parts = point.text[1:-1].split(",")  # strip the surrounding brackets
            cols.append(math.trunc(float(parts[0])))
            rows.append(math.trunc(float(parts[1])))

        # Build the arrays once instead of np.append-ing inside the loop,
        # which copies the whole array for every point.
        return np.array(rows, dtype=float), np.array(cols, dtype=float)

    def load_bboxes_from_csv(self):
        """Return the ``[x1, y1, x2, y2]`` boxes listed for this file in the CSV.

        Returns an empty list when the CSV does not exist or holds no row for
        this file.
        """
        bboxes = []
        if not os.path.exists(self.csv_path):
            return bboxes

        df = pd.read_csv(self.csv_path)
        try:
            key = np.int64(self.filename)
        except (TypeError, ValueError):
            # Non-numeric identifier: compare it as given instead of failing
            # during construction.
            key = self.filename

        for _, row in df[df['File Name'] == key].iterrows():
            x1 = row['X']
            y1 = row['Y']
            bboxes.append([x1, y1, x1 + row['W'], y1 + row['H']])
        return bboxes

    def np_CountUpContinuingOnes(self, b_arr):
        """Return, per position, the length of the run of "ones" it belongs to.

        Positions where ``b_arr`` is not positive get ``-1``.  A run that
        touches the first or the last position is reported one shorter than
        it really is, because index 0 / the last index stand in for the
        missing delimiter.  That behaviour is kept unchanged.
        """
        size = len(b_arr)
        positions = np.arange(size)

        # Index of the closest non-positive entry at or before each position.
        left = np.maximum.accumulate(np.where(b_arr > 0, 0, positions))

        # Same scan on the reversed array, mapped back to forward indices:
        # closest non-positive entry at or after each position.
        right = np.maximum.accumulate(np.where(b_arr[::-1] > 0, 0, positions))
        right = size - 1 - right[::-1]

        return right - left - 1

    def adjust_bounding_box(self, original_coords, left_crop, top_crop):
        """Shift an ``(x1, y1, x2, y2)`` box by the crop offsets."""
        x1, y1, x2, y2 = original_coords
        return x1 - left_crop, y1 - top_crop, x2 - left_crop, y2 - top_crop

    def ExtractBreast(self, img, mask, true_bounding_boxes):
        """Crop image and mask to the widest band of non-constant columns/rows.

        Pixels ``<= 20`` are treated as background while the crop is being
        determined; the returned image keeps its original pixel values.

        Args:
            img: 2-D image array.
            mask: Array with the same height/width as ``img``.
            true_bounding_boxes: Iterable of ``[x1, y1, x2, y2]`` boxes in
                ``img`` coordinates.

        Returns:
            ``(cropped_img, cropped_mask, adjusted_bboxes)`` where the boxes
            are tuples shifted by the first kept column/row.
        """
        work = np.where(img <= 20, 0, img)
        height, _ = work.shape

        # Columns: look at the central 80 % of the rows and keep the columns
        # belonging to the longest run with non-zero standard deviation.
        y_mid = height // 2
        y_half = int(height * 0.4)
        b_arr = work[y_mid - y_half:y_mid + y_half].std(axis=0) != 0
        continuing_ones = self.np_CountUpContinuingOnes(b_arr)
        col_ind = np.where(continuing_ones == continuing_ones.max())[0]
        work = work[:, col_ind]

        # Rows: same procedure on the column-cropped image.
        _, width = work.shape
        x_mid = width // 2
        x_half = int(width * 0.4)
        b_arr = work[:, x_mid - x_half:x_mid + x_half].std(axis=1) != 0
        continuing_ones = self.np_CountUpContinuingOnes(b_arr)
        row_ind = np.where(continuing_ones == continuing_ones.max())[0]

        left_crop = col_ind[0]
        top_crop = row_ind[0]
        adjusted_bboxes = [
            self.adjust_bounding_box(bbox, left_crop, top_crop)
            for bbox in true_bounding_boxes
        ]

        # Index-array selection already returns new arrays, so the inputs are
        # never modified and no explicit copies are required.
        return img[row_ind][:, col_ind], mask[row_ind][:, col_ind], adjusted_bboxes

    def save_data(self, img, mask, bboxes, output_dir, img_name):
        """Write ``img.jpg``, ``mask.png`` and ``bboxes.npy`` into ``output_dir``.

        The directory is created when missing; the mask is scaled by 255
        before it is written.
        """
        os.makedirs(output_dir, exist_ok=True)

        jpg_path = os.path.join(output_dir, 'img.jpg')
        cv2.imwrite(jpg_path, img)

        png_path = os.path.join(output_dir, 'mask.png')
        cv2.imwrite(png_path, mask * 255)

        npy_path = os.path.join(output_dir, 'bboxes.npy')
        np.save(npy_path, np.array(bboxes))

        print(f'Data saved for {img_name} in {output_dir}')

    def save_mask_and_image(self, dcm_path, output_base_path, img_name, data_split):
        """Crop the DICOM image and save it when the cropped mask has a mass.

        Output goes to ``<output_base_path>/<data_split>/<img_name>``.
        Nothing is written when the cropped mask contains no mass pixel.

        Note: ``self.mask_mass`` is replaced by the cropped mask, so it no
        longer matches ``self.shape`` afterwards.
        """
        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        processed_img, self.mask_mass, processed_bboxes = self.ExtractBreast(img, self.mask_mass, self.bboxes)
        img = cv2.normalize(processed_img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        if np.any(self.mask_mass == 1):
            output_dir = os.path.join(output_base_path, data_split, img_name)
            self.save_data(img, self.mask_mass, processed_bboxes, output_dir, img_name)

# Load the split table and keep only the rows of the INbreast dataset.
split_csv_path = "../segdetdata/segdet_split.csv"
split_df = pd.read_csv(split_csv_path)
split_df = split_df.loc[split_df['dataset'] == 'INbreast']

# Path configuration
XML_PATH = "/Volumes/图图/INbreast/INbreast/AllXML/"
CSV_PATH = "/Volumes/图图/INBreast/INbreast/BoundingBoxes_Mass_Classes_2.csv"
DCM_PATH = "/Volumes/图图/INbreast/INbreast/AllDICOMs/"
OUTPUT_BASE_PATH = "../segdetdata/INbreast"

# Every *.dcm entry of the DICOM folder, skipping names that start with '._'.
dicom_files = list(
    filter(
        lambda name: name.endswith('.dcm') and not name.startswith('._'),
        os.listdir(DCM_PATH),
    )
)
processed_count = 0

for filename in dicom_files:
    img_name = filename.split('.')[0]

    split_info = split_df[split_df['data_name'] == img_name]
    if not split_info.empty:
        data_split = split_info['data_split'].iloc[0]
        dcm_path = os.path.join(DCM_PATH, filename)

        # One failing file must not abort the whole run: report it and go on.
        try:
            dcm = pdcm.dcmread(dcm_path)
            img = dcm.pixel_array
            annotation = Annotation(XML_PATH, CSV_PATH, img_name, img.shape)
            annotation.save_mask_and_image(dcm_path, OUTPUT_BASE_PATH, img_name, data_split)
        except Exception as e:
            print(f"Error processing {img_name}: {e}")
        else:
            # Counts every file handled without an exception, including those
            # for which save_mask_and_image wrote nothing.
            processed_count += 1
    else:
        print(f"No split info found for {img_name}")

print(f"Dataset processing completed. Total processed: {processed_count}")

Mask, image, and bounding boxes saved for 24055483 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/24055483
Mask, image, and bounding boxes saved for 24065289 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/24065289
Mask, image, and bounding boxes saved for 51049107 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/51049107
out of bound: /Volumes/图图/INbreast/INbreast/AllXML/22670620.xml
Mask, image, and bounding boxes saved for 22670620 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/22670620
Mask, image, and bounding boxes saved for 24065251 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/24065251
Mask, image, and bounding boxes saved for 20587902 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/20587902
Mask, image, and bounding boxes saved for 22614568 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/22614568
Mask, image, and bounding boxes saved for 20588046 in /Volumes/图图/INbreast/INbreast/seg_det_split/train/20588046
Mask, image, and bounding boxes 

Classification

In [None]:
import os
import pandas as pd
import numpy as np
import pydicom as pdcm
import cv2

def np_CountUpContinuingOnes(b_arr):
    """Return, per position, the length of the run of "ones" it belongs to.

    Positions where ``b_arr`` is not positive get ``-1``.  A run touching the
    first or the last position is reported one shorter than it really is,
    because index 0 / the last index stand in for the missing delimiter.
    """
    size = len(b_arr)
    positions = np.arange(size)

    # Index of the closest non-positive entry at or before each position.
    prev_zero = np.maximum.accumulate(np.where(b_arr > 0, 0, positions))

    # Same scan on the reversed array, mapped back to forward indices.
    flipped = b_arr[::-1]
    next_zero = np.maximum.accumulate(np.where(flipped > 0, 0, positions))
    next_zero = (size - 1) - next_zero[::-1]

    return next_zero - prev_zero - 1

def ExtractBreast(img):
    """Crop a 2-D image to its widest band of non-constant columns and rows.

    Pixels ``<= 20`` are treated as background while the crop is determined;
    the returned crop is taken from an untouched copy of the input.
    """
    untouched = img.copy()
    work = np.where(img <= 20, 0, img)

    # Columns: inspect the central 80 % of the rows and keep the columns that
    # belong to the longest run with non-zero standard deviation.
    n_rows, _ = work.shape
    row_mid = n_rows // 2
    row_half = int(n_rows * 0.4)
    col_active = work[row_mid - row_half:row_mid + row_half].std(axis=0) != 0
    col_runs = np_CountUpContinuingOnes(col_active)
    col_ind = np.where(col_runs == col_runs.max())[0]
    work = work[:, col_ind]

    # Rows: same procedure on the column-cropped image.
    _, n_cols = work.shape
    col_mid = n_cols // 2
    col_half = int(n_cols * 0.4)
    row_active = work[:, col_mid - col_half:col_mid + col_half].std(axis=1) != 0
    row_runs = np_CountUpContinuingOnes(row_active)
    row_ind = np.where(row_runs == row_runs.max())[0]

    return untouched[row_ind][:, col_ind]

# Read the data-split CSV file
split_csv_path = "../classification_data/classification_split.csv"
split_df = pd.read_csv(split_csv_path)
# Keep only the INbreast dataset
split_df = split_df[split_df['dataset'] == 'INbreast']

# Read the XLS file
xls_path = '/Volumes/图图/INBreast/INbreast/INbreast.xls'
df = pd.read_excel(xls_path)

# Merge in the data-split information.
# The left key is 'File Name' converted to text and cut at the first '.';
# the inner join keeps only the rows that have a matching 'data_name'.
df = pd.merge(df, split_df[['data_name', 'data_split']], 
             left_on=df['File Name'].apply(lambda x: str(x).split('.')[0]), 
             right_on='data_name', how='inner')

# Define the input and output paths
DCM_PATH = "/Volumes/图图/INBreast/INbreast/AllDICOMs"
OUTPUT_BASE_PATH = "../classification_data/INbreast"

def _normalize_label(value):
    """Return ``value`` as text without spaces; integral floats lose their '.0'."""
    # A numeric spreadsheet column that contains empty cells is loaded as
    # float, so a label such as 2 arrives as 2.0 and str() would give '2.0',
    # which matches none of the expected labels.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).replace(' ', '')


def _build_meta_data(row):
    """Build the metadata dict (Composition, Bi-Rads, Finding) for one row."""
    meta_data = {}

    # Composition: ACR 1-4 becomes 'Level A'-'Level D'; anything else is omitted.
    acr_map = {'1': 'A', '2': 'B', '3': 'C', '4': 'D'}
    composition = _normalize_label(row['ACR'])
    if composition in acr_map:
        meta_data['Composition'] = 'Level ' + acr_map[composition]

    # Bi-Rads: 4a/4b/4c collapse to 'Bi-Rads 4', category 6 is omitted, and
    # any value outside the known set is stored as it is.
    bi_rads = _normalize_label(row['Bi-Rads'])
    if bi_rads in ('4a', '4b', '4c'):
        meta_data['Bi-Rads'] = 'Bi-Rads 4'
    elif bi_rads in ('1', '2', '3', '5'):
        meta_data['Bi-Rads'] = f"Bi-Rads {bi_rads}"
    elif bi_rads != '6':
        meta_data['Bi-Rads'] = bi_rads

    # Findings: a column marked with 'X' contributes its label; no mark at
    # all means 'Normal'.
    finding_columns = (
        ('Mass', 'Mass'),
        ('Micros', 'Calcification'),
        ('Distortion', 'Architectural distortion'),
        ('Asymmetry', 'Asymmetry'),
    )
    findings = [
        label
        for column, label in finding_columns
        if str(row[column]).strip() == 'X'
    ]
    meta_data['Finding'] = findings or ['Normal']

    return meta_data


def process_and_save():
    """Export every row of the module-level ``df`` as image plus metadata.

    For each row whose DICOM file exists under ``DCM_PATH`` the breast crop
    is written to ``<OUTPUT_BASE_PATH>/<data_split>/<file_name>/img.jpg`` and
    the metadata dict to ``info_dict.npy`` in the same folder.  Rows without
    a DICOM file are reported and skipped.
    """
    for _, row in df.iterrows():
        file_name = str(row['File Name']).split('.')[0]
        data_split = row['data_split']

        # Build the metadata
        meta_data = _build_meta_data(row)

        # Read and process the DICOM file
        dcm_path = os.path.join(DCM_PATH, file_name + '.dcm')
        if not os.path.exists(dcm_path):
            print(f"DICOM file for {file_name} not found.")
            continue

        dcm = pdcm.dcmread(dcm_path)
        img = dcm.pixel_array
        img = ExtractBreast(img)
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Save the image and the metadata
        img_output_path = os.path.join(OUTPUT_BASE_PATH, data_split, file_name)
        os.makedirs(img_output_path, exist_ok=True)

        jpg_path = os.path.join(img_output_path, 'img.jpg')
        cv2.imwrite(jpg_path, img)

        # The dict is stored as a pickled object array, so reading it back
        # requires np.load(..., allow_pickle=True).
        npy_path = os.path.join(img_output_path, 'info_dict.npy')
        np.save(npy_path, meta_data)

        print(f"Processed {file_name} for {data_split} set")

# Process and save the data
process_and_save()
print("Processing complete.")

Processed 22580732 for Train set
Processed 51048972 for Train set
Processed 50998059 for Train set
Processed 24055382 for Train set
Processed 22580706 for Train set
Processed 50997624 for Train set
Processed 22670094 for Train set
Processed 24065461 for Train set
Processed 50995762 for Train set
Processed 24055355 for Train set
Processed 50998177 for Train set
Processed 50994354 for Train set
Processed 24065407 for Train set
Processed 22678833 for Train set
Processed 53580804 for Train set
Processed 24065584 for Train set
Processed 50995899 for Train set
Processed 50997796 for Train set
Processed 50998634 for Train set
Processed 53581796 for Train set
Processed 30011850 for Train set
Processed 50996709 for Train set
Processed 53587663 for Train set
Processed 50994841 for Train set
Processed 53581237 for Train set
Processed 22427751 for Train set
Processed 50996854 for Train set
Processed 50999094 for Train set
Processed 50994327 for Train set
Processed 50999432 for Train set
Processed 