In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input, concatenate
from sklearn.model_selection import train_test_split
from keras.applications import ResNet50
from keras.layers import Embedding, Reshape
from keras.preprocessing.image import ImageDataGenerator
import cv2
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder
from joblib import Parallel, delayed

In [2]:
# Read the three CSV inputs used by the rest of the notebook, in the same
# order as before: merged training table, test table, and the training table
# that carries the bounding-box columns.
train_data, test_data, train_withbox_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'merged_data.csv',
        'test.csv',
        'train_withbox_split_clean_encode.csv',
    )
)

In [3]:
# Column dtype conversions / NaN drops for train_data (currently disabled)
# train_data['laterality']=train_data['laterality'].astype('string')
# train_data['view']=train_data['view'].astype('category')
# train_data['file_path']=train_data['file_path'].astype('string')
# train_data['cancer'] = train_data['cancer'].astype('int')
# train_data = train_data.dropna(subset=['age'])
# train_data = train_data.dropna(subset=['density'])
# train_data = train_data.dropna(subset=['BIRADS'])

# Build the image path columns as "<folder>/<patient_id>_<image_id>.png".
train_stem = train_data['patient_id'].astype(str) + '_' + train_data['image_id'].astype(str)
train_data['file_path'] = 'archive/' + train_stem + '.png'

withbox_stem = (
    train_withbox_data['patient_id'].astype(str)
    + '_'
    + train_withbox_data['image_id'].astype(str)
)
train_withbox_data['image_path'] = 'archive/' + withbox_stem + '.png'
train_withbox_data['test_path'] = 'test/' + withbox_stem + '.png'

# Per-column count of missing values (displayed as the cell output).
train_data.isna().sum()


site_id                    0
patient_id                 0
image_id                   0
laterality                 0
view                       0
age                        0
cancer                     0
biopsy                     0
invasive                   0
BIRADS                     0
implant                    0
density                    0
machine_id                 0
difficult_negative_case    0
x                          0
y                          0
w                          0
h                          0
split                      0
file_path                  0
dtype: int64

In [4]:
# Preview the first five rows of the bounding-box table.
train_withbox_data.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,...,density,machine_id,difficult_negative_case,x,y,w,h,split,image_path,test_path
0,0,0,2,5,640805896,L,5,68,0,0,...,255,21,1,0,357,941,1982,1,archive/5_640805896.png,test/5_640805896.png
1,1,1,2,5,940388076,R,1,68,0,0,...,255,21,0,1187,50,887,2149,1,archive/5_940388076.png,test/5_940388076.png
2,2,2,2,5,1351088028,L,1,68,0,0,...,255,21,1,4,334,814,2094,1,archive/5_1351088028.png,test/5_1351088028.png
3,3,3,2,5,1633417959,R,5,68,0,0,...,255,21,0,1097,211,985,2157,1,archive/5_1633417959.png,test/5_1633417959.png
4,4,4,1,25,1789648218,L,1,62,0,0,...,1,49,0,13,563,1655,3080,0,archive/25_1789648218.png,test/25_1789648218.png


In [5]:
# merged_data = pd.merge(train_data, train_split, on=['patient_id', 'image_id'], how='inner', suffixes=('_train', '_split'))
# merged_data.to_csv('merged_data.csv', index=False)


In [None]:
import pydicom

# Load a single DICOM file
dcm_file = "1864590858.dcm"
ds = pydicom.dcmread(dcm_file)

# Pixel spacing: use PixelSpacing when present, otherwise ImagerPixelSpacing,
# otherwise None.
pixel_spacing = None
for spacing_attr in ('PixelSpacing', 'ImagerPixelSpacing'):
    if hasattr(ds, spacing_attr):
        pixel_spacing = getattr(ds, spacing_attr)
        break

# Image size as stored in the DICOM header
rows, columns = ds.Rows, ds.Columns

print("Pixel Spacing:", pixel_spacing)
print("Image Size (rows, columns):", rows, columns)



In [None]:
def resize_image_and_bbox(image, bbox, original_size=(4605,3698)):
    """Rescale a bounding box from ``original_size`` to the size of ``image``.

    Despite the name, the image itself is returned untouched; only the box is
    rescaled so that it lines up with ``image``.

    Parameters
    ----------
    image : array-like with a ``shape`` attribute
        Image whose ``shape[1]`` / ``shape[0]`` are taken as the target
        width / height.
    bbox : sequence of 4 numbers
        ``[x, y, w, h]`` expressed in the coordinate system of
        ``original_size``. The input is never modified.
    original_size : sequence of 2 numbers
        Size the box coordinates refer to, in the same ``(width, height)``
        order as the target size computed below.
        NOTE(review): confirm the default really is (width, height) and not
        (rows, columns).

    Returns
    -------
    (image, bbox) : tuple
        The unchanged image and a new float ``numpy.ndarray`` ``[x, y, w, h]``
        scaled to the image size.
    """
    # Target size in [width, height] order.
    target_size = np.array([image.shape[1], image.shape[0]], dtype=float)

    # Per-axis scale factor between the original and the target size.
    ratios = target_size / np.asarray(original_size, dtype=float)

    # Work on a float copy: with an integer array the in-place assignment
    # below would silently truncate the scaled coordinates.
    scaled_bbox = np.array(bbox, dtype=float)
    scaled_bbox[:2] *= ratios  # x, y
    scaled_bbox[2:] *= ratios  # w, h

    return image, scaled_bbox


In [None]:
import matplotlib.patches as patches

# Grid of axes: one panel per inspected image.
n_rows, n_cols = 6, 2
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 25))

# Inspect the last n_rows * n_cols (= 12) rows of the bounding-box table.
data_subset = train_withbox_data.tail(n_rows * n_cols)

for i, (_, row) in enumerate(data_subset.iterrows()):
    # Fill the grid column by column. The divisor must be the number of rows;
    # a divisor of 5 would index a third column that does not exist.
    axis = ax[i % n_rows, i // n_rows]

    image_path = row['image_path']
    bbox = [row['x'], row['y'], row['w'], row['h']]

    # Load the image as a single-channel array.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None (it does not raise) when a file is missing
        # or unreadable; mark the panel instead of crashing the whole figure.
        axis.set_title(f'Could not read {image_path}')
        axis.axis('off')
        continue

    # Rescale the bounding box to the size of the loaded image.
    resized_image, resized_bbox = resize_image_and_bbox(image, bbox)

    # Show the image with its bounding box drawn on top.
    axis.imshow(resized_image, cmap='gray')
    axis.add_patch(patches.Rectangle((resized_bbox[0], resized_bbox[1]),
                                     resized_bbox[2], resized_bbox[3],
                                     linewidth=1, edgecolor='r', facecolor='none'))
    axis.set_title('Image with Bounding Box')

plt.tight_layout()
plt.show()


In [None]:
# Image preprocessing: crop each image down to the breast region and save the result under test/
import cv2
import glob
import numpy as np
from tqdm import tqdm
import os
import re
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
from joblib import Parallel, delayed

def fit_image(fname):
    """Crop an image to its largest bright region and save it under ``test/``.

    The image is read with OpenCV, a 5-pixel border is trimmed, pixels whose
    first channel is above 20 are grouped into 8-connected components, and the
    bounding box of the largest non-background component is cropped out. The
    first channel of the crop is written to
    ``test/<patient_id>_<image_id>.png``.

    Parameters
    ----------
    fname : str
        Path to an image whose file name contains ``<digits>_<digits>.png``.

    Returns
    -------
    bool
        ``True`` when the cropped image was written, ``False`` when
        ``cv2.imwrite`` reported a failure.

    Raises
    ------
    ValueError
        If the file name does not contain the expected ID pattern, or the
        image has no pixel above the threshold.
    FileNotFoundError
        If OpenCV cannot read ``fname``.
    """
    # Raw string + escaped dot: '\d' in a normal string is an invalid escape
    # sequence, and a bare '.' would match any character.
    id_match = re.search(r'(\d+)_(\d+)\.png', os.path.basename(fname))
    if id_match is None:
        raise ValueError(f'Cannot parse patient/image id from file name: {fname}')
    patient_id, im_id = id_match.groups()

    X = cv2.imread(fname)
    if X is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f'Could not read image: {fname}')

    # Some images have narrow exterior "frames" that complicate selection of the main data. Cutting off the frame
    X = X[5:-5, 5:-5]

    # Regions of non-empty pixels (first channel only), 8-connectivity.
    foreground = (X[:, :, 0] > 20).astype(np.uint8)
    stats = cv2.connectedComponentsWithStats(foreground, 8, cv2.CV_32S)[2]
    if stats.shape[0] < 2:
        # Row 0 is the background label; without any other row there is
        # nothing to crop to.
        raise ValueError(f'No foreground region found in image: {fname}')

    # Largest component by area, skipping the background row (hence the +1).
    idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
    x1, y1, w, h = stats[idx][:4]
    X_fit = X[y1: y1 + h, x1: x1 + w]

    os.makedirs('test', exist_ok=True)
    return bool(cv2.imwrite(f'test/{patient_id}_{im_id}.png', X_fit[:, :, 0]))

def fit_all_images(all_images):
    """Run ``fit_image`` on every path in ``all_images`` using 4 joblib workers.

    The per-image return values are discarded; nothing is returned.
    """
    crop_task = delayed(fit_image)
    pending = (crop_task(path) for path in all_images)
    Parallel(n_jobs=4)(pending)

if __name__ == '__main__':

    # Crop every image listed in the bounding-box table into test/.
    fit_all_images(train_withbox_data['image_path'])

    # Visual spot check: show 5 randomly drawn crops next to their sources.
    np.random.seed(123)
    # glob returns files in filesystem order; sorting makes the seeded draw
    # pick the same files on every machine.
    cropped_files = sorted(glob.glob('test/*'))
    source_names = train_withbox_data['image_path'].map(os.path.basename)

    for fname in np.random.choice(cropped_files, size=5):
        # Look the source image up by exact file name. A substring match on
        # "<patient_id>_<image_id>" could also hit a different image whose
        # IDs merely contain these digits.
        source_paths = train_withbox_data.loc[
            source_names == os.path.basename(fname), 'image_path'
        ]
        if source_paths.empty:
            print(f'No source image listed for {fname}; skipping')
            continue

        with Image.open(fname) as cropped_file:
            im1 = cropped_file.convert('F')
        with Image.open(source_paths.values[0]) as source_file:
            im2 = source_file.convert('F')

        fig, (ax_out, ax_src) = plt.subplots(1, 2, figsize=(20, 10))
        fig.suptitle(f'[{fname}]')
        ax_out.imshow(im1, cmap='gray')
        ax_out.set_title(f'Output image {im1.size}')
        ax_src.imshow(im2, cmap='gray')
        ax_src.set_title(f'Source image {im2.size}')
        plt.show()


In [7]:
def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image at ``target_size`` and return it as an array divided by 255.

    Parameters
    ----------
    image_path : str
        Path of the image file to load.
    target_size : tuple of int
        Size passed through to ``load_img``.

    Returns
    -------
    The ``img_to_array`` result divided by 255.0.
    """
    pixels = img_to_array(load_img(image_path, target_size=target_size))
    # Scale 8-bit intensities into the [0, 1] range.
    return pixels / 255.0

In [11]:
def create_dual_view_model(meta_data_size):
    """Build and compile a two-view image classifier with a metadata input.

    Both image inputs go through the same ResNet50 instance, so the two views
    share weights. Its flattened feature maps are joined with the metadata
    vector and fed to a single sigmoid unit.

    Parameters
    ----------
    meta_data_size : int
        Number of features in the metadata vector.

    Returns
    -------
    A compiled Keras ``Model`` taking ``[CC image, MLO image, metadata]`` and
    producing one probability per sample (Adam optimizer, binary
    cross-entropy loss, accuracy metric).

    NOTE(review): the ImageNet ResNet50 weights are normally used together
    with ``keras.applications.resnet50.preprocess_input``; confirm that
    feeding images scaled to [0, 1] is intended.
    """
    image_shape = (224, 224, 3)

    # Define inputs
    input_CC = Input(shape=image_shape)
    input_MLO = Input(shape=image_shape)
    meta_data_input = Input(shape=(meta_data_size,))

    # Shared CNN backbone (ResNet50 as an example; any other backbone works).
    # Passing input_shape gives the backbone a fully defined output shape, so
    # Flatten/Dense below know their feature dimension.
    cnn_model = ResNet50(include_top=False, weights='imagenet', input_shape=image_shape)

    # Pass both views through the same backbone and flatten the feature maps.
    x1 = Flatten()(cnn_model(input_CC))
    x2 = Flatten()(cnn_model(input_MLO))

    # Join image features with the metadata vector. The notebook imports the
    # functional `concatenate`; the `Concatenate` layer class is never
    # imported, so referencing it raised NameError.
    x = concatenate([x1, x2, meta_data_input])

    # Binary classification head
    output = Dense(1, activation='sigmoid')(x)

    # Define and compile the model
    model = Model(inputs=[input_CC, input_MLO, meta_data_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [14]:
# Numeric metadata columns as NumPy arrays
age = train_withbox_data['age'].to_numpy()
implant = train_withbox_data['implant'].to_numpy()

# machine_id is categorical: one-hot encode it into a dense matrix
one_hot_encoder = OneHotEncoder(sparse_output=False)
machine_id = one_hot_encoder.fit_transform(
    train_withbox_data['machine_id'].to_numpy()[:, np.newaxis]
)

# Feature matrix: age, implant, then the one-hot machine_id columns
metadata = np.column_stack((age, implant, machine_id))

# Binary target
labels = train_withbox_data['cancer'].to_numpy()


In [None]:
# Load preprocessed images as aligned (CC, MLO) pairs.
#
# The dual-view model needs sample i of every input to describe the same
# breast. Selecting the view == 1 rows and the view == 5 rows independently
# gives two arrays of unrelated order and length, neither of which matches
# the per-image `metadata` / `labels` arrays, so `model.fit` fails on
# mismatched sample counts. Instead, pair the views on patient + laterality.
# NOTE(review): assumes view code 1 is CC and 5 is MLO, as the original
# variable names imply -- confirm against the encoding step.
pair_keys = ['patient_id', 'laterality']
cc_rows = train_withbox_data[train_withbox_data['view'] == 1]
mlo_rows = train_withbox_data[train_withbox_data['view'] == 5]
paired_views = cc_rows.merge(
    mlo_rows[pair_keys + ['image_path']],
    on=pair_keys,
    how='inner',
    suffixes=('_CC', '_MLO'),
)

CC_images = np.array([preprocess_image(path) for path in paired_views['image_path_CC']])
MLO_images = np.array([preprocess_image(path) for path in paired_views['image_path_MLO']])

# Rebuild metadata and labels for the paired samples so every model input has
# one row per pair. The already-fitted encoder keeps the one-hot column
# layout identical to the per-image metadata.
metadata = np.column_stack([
    paired_views['age'].to_numpy(),
    paired_views['implant'].to_numpy(),
    one_hot_encoder.transform(paired_views['machine_id'].to_numpy().reshape(-1, 1)),
])
labels = paired_views['cancer'].to_numpy()

assert len(CC_images) == len(MLO_images) == len(metadata) == len(labels), \
    "model inputs and labels must have the same number of samples"

In [None]:
# Instantiate the dual-view network; the metadata input width is the number
# of columns in the metadata matrix.
model = create_dual_view_model(meta_data_size=metadata.shape[1])

In [None]:
# Fit the network for 10 epochs with mini-batches of 32 samples.
model.fit(
    x=[CC_images, MLO_images, metadata],
    y=labels,
    batch_size=32,
    epochs=10,
)