In [78]:
import os
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
from io import BytesIO
import json
import boto3
from sklearn.model_selection import train_test_split

In [79]:
import boto3

s3 = boto3.client('s3')
bucket_name = 'ella-dlbiomarkers'
prefix = 'CMMD-D2/'  # or adjust to your specific path

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
for obj in response.get('Contents', []):
    print(obj['Key'])


CMMD-D2/D2-0001_1-1.dcm
CMMD-D2/D2-0001_1-2.dcm
CMMD-D2/D2-0001_1-3.dcm
CMMD-D2/D2-0001_1-4.dcm
CMMD-D2/D2-0002_1-1.dcm
CMMD-D2/D2-0002_1-2.dcm
CMMD-D2/D2-0002_1-3.dcm
CMMD-D2/D2-0002_1-4.dcm
CMMD-D2/D2-0003_1-1.dcm
CMMD-D2/D2-0003_1-2.dcm
CMMD-D2/D2-0003_1-3.dcm
CMMD-D2/D2-0003_1-4.dcm
CMMD-D2/D2-0004_1-1.dcm
CMMD-D2/D2-0004_1-2.dcm
CMMD-D2/D2-0004_1-3.dcm
CMMD-D2/D2-0004_1-4.dcm
CMMD-D2/D2-0005_1-1.dcm
CMMD-D2/D2-0005_1-2.dcm
CMMD-D2/D2-0005_1-3.dcm
CMMD-D2/D2-0005_1-4.dcm
CMMD-D2/D2-0006_1-1.dcm
CMMD-D2/D2-0006_1-2.dcm
CMMD-D2/D2-0006_1-3.dcm
CMMD-D2/D2-0006_1-4.dcm
CMMD-D2/D2-0007_1-1.dcm
CMMD-D2/D2-0007_1-2.dcm
CMMD-D2/D2-0007_1-3.dcm
CMMD-D2/D2-0007_1-4.dcm
CMMD-D2/D2-0008_1-1.dcm
CMMD-D2/D2-0008_1-2.dcm
CMMD-D2/D2-0008_1-3.dcm
CMMD-D2/D2-0008_1-4.dcm
CMMD-D2/D2-0009_1-1.dcm
CMMD-D2/D2-0009_1-2.dcm
CMMD-D2/D2-0009_1-3.dcm
CMMD-D2/D2-0009_1-4.dcm
CMMD-D2/D2-0010_1-1.dcm
CMMD-D2/D2-0010_1-2.dcm
CMMD-D2/D2-0010_1-3.dcm
CMMD-D2/D2-0010_1-4.dcm
CMMD-D2/D2-0011_1-1.dcm
CMMD-D2/D2-0011_

In [80]:
# Load configuration file
with open('config.json') as config_file:
    config = json.load(config_file)

In [81]:
# Configuration settings
bucket_name = config["s3_bucket_name"]
file_path_excel = config["file_path_excel"]
data_percentage = config.get("data_percentage", 1)  # Default to 100% if not set
train_test_split_percentage = config.get("train_test_split_percentage", 0.8)  # Default to 80% train, 20% test
output_excel_name_train = config.get("output_excel_name_train", "train_labels.xlsx")
output_excel_name_test = config.get("output_excel_name_test", "test_labels.xlsx")
views = config.get("views", ["CC", "MLO", "US"])

# Load metadata from S3 directly
obj = s3.get_object(Bucket=bucket_name, Key=file_path_excel)
metadata = pd.read_excel(BytesIO(obj['Body'].read()))

# Sample a subset of the data based on data_percentage
metadata = metadata.sample(frac=data_percentage, random_state=42).reset_index(drop=True)

# Split metadata into train and test sets
train_metadata_df, test_metadata_df = train_test_split(metadata, train_size=train_test_split_percentage, random_state=42)

# Map view types to exam IDs
view_map = {
    'L_CC': '1-1',
    'L_MLO': '1-2',
    'R_CC': '1-3',
    'R_MLO': '1-4'
}


In [82]:
def convert_dcm_to_png_s3(dcm_s3_key, s3_output_key):
    """Convert a DICOM file from S3 to PNG and upload to S3."""
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=dcm_s3_key)
        dcm = pydicom.dcmread(BytesIO(obj['Body'].read()))
        pixel_array = dcm.pixel_array
        with BytesIO() as png_buffer:
            plt.imsave(png_buffer, pixel_array, cmap='gray', format='png')
            png_buffer.seek(0)
            s3.put_object(Bucket=bucket_name, Key=s3_output_key, Body=png_buffer)
    except s3.exceptions.NoSuchKey:
        print(f"File not found: {dcm_s3_key}")
    except Exception as e:
        print(f"Error converting {dcm_s3_key}: {e}")


In [83]:
def process_metadata(metadata, output_prefix):
    """Process metadata and save PNGs and metadata entries to S3."""
    processed_data = []
    for _, row in metadata.iterrows():
        patient_id = row['patient_id']
        subtype = row['subtype']
        
        for view in views:
            exam_id = view_map.get(view)
            if not exam_id:
                continue
            
            # Construct S3 key for the DICOM file with patient ID and exam ID
            dcm_s3_key = f"CMMD-D2/{patient_id}_{exam_id}.dcm"
            s3_output_png_key = f"{output_prefix}/PNGs/{patient_id}_{view}.png"  # Include view for clarity
            
            # Convert DICOM to PNG and upload to S3
            convert_dcm_to_png_s3(dcm_s3_key, s3_output_png_key)
            
            # Append entry with laterality included in view name
            processed_data.append({
                'patient_id': patient_id,
                f'{view}_file': s3_output_png_key,
                'subtype': subtype
            })
    
    return processed_data

views = ["L_CC", "L_MLO", "R_CC", "R_MLO"]


In [86]:
# Number of items in the training metadata
train_count = len(train_metadata_df)  # or train_metadata_df.shape[0]

# Number of items in the testing metadata
test_count = len(test_metadata_df)  # or test_metadata_df.shape[0]

# Print the counts
print(f"Number of items in train_metadata_df: {train_count}")
print(f"Number of items in test_metadata_df: {test_count}")


Number of items in train_metadata_df: 2358
Number of items in test_metadata_df: 590


In [87]:
# Process metadata and save train data to S3
train_data = process_metadata(train_metadata_df, "CMMD-D2/train")

# Convert processed data to DataFrame and pivot
df_train = pd.DataFrame(train_data)
df_train_pivot = df_train.pivot_table(
    index='patient_id',
    values=[f'{view}_file' for view in views if f'{view}_file' in df_train.columns],
    aggfunc='first'
).reset_index()
df_train_pivot['subtype'] = train_metadata_df.groupby('patient_id')['subtype'].first().values
df_train_pivot['target'] = df_train_pivot['subtype'].apply(lambda x: 1 if x == "Luminal A" else 0)

# Save train metadata pivot table to S3 as Excel
train_excel_buffer = BytesIO()
df_train_pivot.to_excel(train_excel_buffer, index=False)
s3.put_object(Bucket=bucket_name, Key="CMMD-D2/train_labels.xlsx", Body=train_excel_buffer.getvalue())


File not found: CMMD-D2/D2-0402_1-3.dcm
File not found: CMMD-D2/D2-0402_1-4.dcm
File not found: CMMD-D2/D2-0402_1-3.dcm
File not found: CMMD-D2/D2-0402_1-4.dcm
File not found: CMMD-D2/D2-0311_1-3.dcm
File not found: CMMD-D2/D2-0311_1-4.dcm
File not found: CMMD-D2/D2-0124_1-3.dcm
File not found: CMMD-D2/D2-0124_1-4.dcm
File not found: CMMD-D2/D2-0296_1-3.dcm
File not found: CMMD-D2/D2-0296_1-4.dcm


{'ResponseMetadata': {'RequestId': '1NR90CFREPNEKC1M',
  'HostId': 'grFQ3i7oK3ri1qARbcycjCytQfkGc3kUFusmhgAAP69E3uQnCCvpH4NKeIO9uNowypncyGRFoMKYjHWYwsl/XyAEX2MGBxiw6cGDbRBvzUg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'grFQ3i7oK3ri1qARbcycjCytQfkGc3kUFusmhgAAP69E3uQnCCvpH4NKeIO9uNowypncyGRFoMKYjHWYwsl/XyAEX2MGBxiw6cGDbRBvzUg=',
   'x-amz-request-id': '1NR90CFREPNEKC1M',
   'date': 'Tue, 29 Oct 2024 07:08:03 GMT',
   'x-amz-version-id': '9dPazKKHH0YIO5bTUvC_egWt5KBT3QmJ',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"ead1640cbb256f26d8320f875e3354f8"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ead1640cbb256f26d8320f875e3354f8"',
 'ServerSideEncryption': 'AES256',
 'VersionId': '9dPazKKHH0YIO5bTUvC_egWt5KBT3QmJ'}

In [88]:
# Process test metadata and save to S3
test_data = process_metadata(test_metadata_df, "CMMD-D2/test")
df_test = pd.DataFrame(test_data)
df_test_pivot = df_test.pivot_table(index='patient_id', values=[f'{view}_file' for view in views if f'{view}_file' in df_test.columns], aggfunc='first').reset_index()
df_test_pivot['subtype'] = test_metadata_df.groupby('patient_id')['subtype'].first().values
df_test_pivot['target'] = df_test_pivot['subtype'].apply(lambda x: 1 if x == "Luminal A" else 0)

# Save test metadata to S3
excel_buffer = io.BytesIO()
df_test_pivot.to_excel(excel_buffer, index=False)
s3.put_object(Bucket=bucket_name, Key="CMMD-D2/test_labels.xlsx", Body=excel_buffer.getvalue())

print("Train and test metadata and images saved to S3.")

File not found: CMMD-D2/D2-0124_1-3.dcm
File not found: CMMD-D2/D2-0124_1-4.dcm
File not found: CMMD-D2/D2-0296_1-3.dcm
File not found: CMMD-D2/D2-0296_1-4.dcm
File not found: CMMD-D2/D2-0311_1-3.dcm
File not found: CMMD-D2/D2-0311_1-4.dcm
Train and test metadata and images saved to S3.
