In [None]:
import os
import pydicom
import numpy as np
import cv2
from PIL import Image
import shutil
from tqdm import tqdm
import pandas as pd

def np_CountUpContinuingOnes(b_arr):
    """Return, for every position, the length of the run of ones it belongs to.

    Parameters
    ----------
    b_arr : 1-D array-like
        Boolean or numeric flags; any element ``> 0`` counts as a one.

    Returns
    -------
    numpy.ndarray
        Integer array of the same length as ``b_arr``. Positions inside a
        run of ones hold that run's length; positions holding a zero hold -1.

    Notes
    -----
    Runs that touch the first or last element are measured against virtual
    zeros at index -1 and ``len(b_arr)``, so their length is reported
    correctly (e.g. ``[1, 1, 1]`` yields ``[3, 3, 3]``).
    """
    mask = np.asarray(b_arr) > 0
    n = len(mask)
    positions = np.arange(n)

    # Index of the nearest zero at or before each position; -1 when the run
    # reaches the start of the array without meeting a zero.
    left = np.maximum.accumulate(np.where(mask, -1, positions))

    # Index of the nearest zero at or after each position; n when the run
    # reaches the end of the array without meeting a zero.
    right = np.minimum.accumulate(np.where(mask, n, positions)[::-1])[::-1]

    # Number of elements strictly between the two bounding zeros.
    return right - left - 1

def ExtractBreast(img_array):
    """Crop a 2-D image to its widest non-uniform region.

    Pixels with value <= 40 are treated as background (set to 0) for the
    purpose of locating the region only; the returned crop is taken from an
    unmodified copy of the input.

    The columns are chosen first, by looking at a band of rows centred on the
    image middle (40% of the height on either side) and keeping the longest
    run of columns whose standard deviation in that band is non-zero. The
    rows are then chosen the same way on the column-cropped image.

    Parameters
    ----------
    img_array : numpy.ndarray
        2-D intensity image.

    Returns
    -------
    numpy.ndarray
        The selected rows and columns of the original image.
    """
    untouched = img_array.copy()
    # Flatten near-black pixels so pure background has zero variance.
    cleaned = np.where(img_array <= 40, 0, img_array)

    def longest_active_run(active):
        # Indices of the elements belonging to the longest run of True values.
        run_lengths = np_CountUpContinuingOnes(active)
        return np.where(run_lengths == run_lengths.max())[0]

    # Horizontal cropping: pick the columns.
    n_rows, _ = cleaned.shape
    mid_row = n_rows // 2
    half_band = int(n_rows * 0.4)
    row_band = cleaned[mid_row - half_band:mid_row + half_band]
    col_ind = longest_active_run(row_band.std(axis=0) != 0)
    cleaned = cleaned[:, col_ind]

    # Vertical cropping: pick the rows, using only the columns kept above.
    _, n_cols = cleaned.shape
    mid_col = n_cols // 2
    half_band = int(n_cols * 0.4)
    col_band = cleaned[:, mid_col - half_band:mid_col + half_band]
    row_ind = longest_active_run(col_band.std(axis=1) != 0)

    return untouched[row_ind][:, col_ind]

def parse_dcm_info(filepath):
    """Derive labels from the directory names in a file path.

    The path is split on ``os.sep`` and each component is compared (exact,
    case-sensitive match) against the known folder names below. A metadata
    key is only present in the result when one of its folder names occurs in
    the path, so unknown attributes are simply omitted.

    Parameters
    ----------
    filepath : str
        Path to a DICOM file inside the dataset directory tree.

    Returns
    -------
    dict
        Any of the keys "Pathology", "Bi-Rads" and "Composition", in that
        order, mapped to their label strings.
    """
    components = filepath.split(os.sep)

    # For each metadata key: (folder name, label) pairs in priority order.
    # Folder names are matched exactly as spelled here.
    lookup = (
        ("Pathology", (
            ("Bening", "Benign"),
            ("maling", "Malignant"),
            ("Normal", "Benign"),
        )),
        ("Bi-Rads", (
            ("BIRADS II", "Bi-Rads 2"),
            ("BIRADS IV", "Bi-Rads 4"),
            ("BIRADS V", "Bi-Rads 5"),
            ("BARADS I", "Bi-Rads 1"),
        )),
        ("Composition", (
            ("ACR1", "Level A"),
            ("ACR2", "Level B"),
            ("ACR3", "Level C"),
            ("ACR4", "Level D"),
        )),
    )

    info = {}
    for key, candidates in lookup:
        # First matching folder name wins, mirroring an if/elif chain.
        label = next(
            (value for folder, value in candidates if folder in components),
            None,
        )
        if label is not None:
            info[key] = label

    return info

def process_dcm_file(dcm_path, output_base, split_info_df):
    """Convert one DICOM file into a cropped JPEG plus a metadata file.

    The sample key ``{patient}_{study}_{laterality}_{view}`` is derived from
    the file name and looked up in ``split_info_df`` to find the data split.
    On success the cropped 8-bit image is written to
    ``<output_base>/<data_split>/<data_name>/img.jpg`` and the path-derived
    metadata dict to ``info_dict.npy`` in the same directory.

    Parameters
    ----------
    dcm_path : str
        Path to the DICOM file. Its base name is expected to look like
        ``<prefix>_<patient>_<study>_<L|R>_<view>_...``.
    output_base : str
        Root directory for the converted dataset.
    split_info_df : pandas.DataFrame
        Table with at least the columns ``data_name`` and ``data_split``.

    Returns
    -------
    bool
        True when the image and metadata were written; False when no split
        entry exists for the sample or any error occurred (a message is
        printed in both cases).
    """
    try:
        # Build the sample key from the file name.
        filename = os.path.basename(dcm_path)
        name_parts = filename.split('_')
        patient_id = name_parts[1]  # e.g. P118
        study_id = name_parts[2]    # e.g. 81329

        if "_L_" in filename:
            laterality = "L"
        elif "_R_" in filename:
            laterality = "R"
        else:
            laterality = "U"

        if "CC" in filename:
            view = "CC"
        elif "MLO" in filename or "LMO" in filename or "ML" in filename:
            view = "MLO"
        else:
            view = "UNK"

        data_name = f"{patient_id}_{study_id}_{laterality}_{view}"

        # Resolve the split before touching pixel data: the lookup is cheap,
        # while decoding and cropping are not, and samples without a split
        # entry are skipped anyway.
        split_info = split_info_df[split_info_df['data_name'] == data_name]
        if split_info.empty:
            print(f"No split info found for {data_name}")
            return False
        data_split = split_info['data_split'].values[0]

        # Read DICOM file and decode its pixels.
        # NOTE(review): PhotometricInterpretation is not consulted, so
        # MONOCHROME1 images would be processed inverted — confirm the
        # dataset only contains MONOCHROME2.
        dcm = pydicom.dcmread(dcm_path)
        img_array = dcm.pixel_array
        if len(img_array.shape) > 2:
            # Collapse the third axis (presumably colour channels — TODO
            # confirm; a multi-frame file would have frames on axis 0).
            img_array = img_array.mean(axis=2)

        # Rescale the full intensity range to 0-255 and store as 8-bit.
        img_array = cv2.normalize(img_array, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Extract breast region. The crop is taken from the uint8 array
        # above, so it is already 8-bit and needs no further conversion.
        breast_img = ExtractBreast(img_array)

        # Create output directory structure
        output_dir = os.path.join(output_base, data_split, data_name)
        os.makedirs(output_dir, exist_ok=True)

        # Save image
        Image.fromarray(breast_img).save(os.path.join(output_dir, "img.jpg"))

        # Save metadata; attributes that cannot be derived from the path are
        # absent from the dict. np.save pickles the dict, so readers must
        # load it with allow_pickle=True.
        metadata = parse_dcm_info(dcm_path)
        np.save(os.path.join(output_dir, "info_dict.npy"), metadata)

        return True

    except Exception as e:
        # Best-effort batch conversion: report the failure and carry on with
        # the next file instead of aborting the whole run.
        print(f"Error processing {dcm_path}: {str(e)}")
        return False

def main():
    """Convert every ``.dcm`` file under the input directory.

    Loads the split table, keeps only the rows whose ``dataset`` column is
    'LAMIS', walks the input tree collecting DICOM paths, and runs
    ``process_dcm_file`` on each one while reporting progress and the final
    success count.
    """
    # Input and output locations
    input_base = "/Volumes/Newsmy/LAMISDMDB/LAMISDMDB VF"
    output_base = "../classification_data/LAMIS"
    os.makedirs(output_base, exist_ok=True)

    # Split table, restricted to this dataset
    split_csv_path = "../classification_data/classification_split.csv"
    all_splits = pd.read_csv(split_csv_path)
    split_info_df = all_splits[all_splits['dataset'] == 'LAMIS']

    # Gather every DICOM file below the input directory
    dcm_files = [
        os.path.join(folder, name)
        for folder, _, names in os.walk(input_base)
        for name in names
        if name.endswith('.dcm')
    ]

    print(f"Found {len(dcm_files)} DICOM files to process")

    # Convert the files one by one, counting the successful conversions
    success_count = sum(
        1
        for dcm_file in tqdm(dcm_files)
        if process_dcm_file(dcm_file, output_base, split_info_df)
    )

    print(f"Processing completed. Successfully processed {success_count}/{len(dcm_files)} files")

# Entry point: run the full conversion when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

Found 2216 DICOM files to process


 29%|██▊       | 637/2216 [05:02<09:49,  2.68it/s]

Error processing /Volumes/Newsmy/LAMISDMDB/LAMISDMDB VF/Bening/BIRADS II/Mass/ACR2/dataMG-PROC_P381_457_R_CC_11_February_2019.dcm: The length of the pixel data in the dataset (30199006 bytes) doesn't match the expected length (30200880 bytes). The dataset may be corrupted or there may be an issue with the pixel data handler.


 39%|███▉      | 866/2216 [06:49<10:51,  2.07it/s]

Error processing /Volumes/Newsmy/LAMISDMDB/LAMISDMDB VF/Bening/BIRADS II/Mass/ACR3/dataMG-PROC_P393_793_L_CC_12_November_2018.dcm: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.


 83%|████████▎ | 1836/2216 [14:32<03:12,  1.98it/s]

Error processing /Volumes/Newsmy/LAMISDMDB/LAMISDMDB VF/Normal/BARADS I/ACR2/dataMG-PROC_P441_689_L_MLO_27_November_2018.dcm: File is missing DICOM File Meta Information header or the 'DICM' prefix is missing from the header. Use force=True to force reading.


100%|██████████| 2216/2216 [17:41<00:00,  2.09it/s]

Processing completed. Successfully processed 2213/2216 files



