In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from keras.utils import Sequence
import os
import numpy as np
from PIL import Image
import pydicom

2023-11-07 21:08:39.037256: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-07 21:08:39.128944: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-07 21:08:39.339440: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-07 21:08:39.339460: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-07 21:08:39.341091: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [4]:
# Read the training metadata table from disk
dataframe = pd.read_csv('train.csv')  # append .iloc[:330] here for a quick smoke run

# Pull the image identifiers and the labels out of the table in one pass
image_names, targets = (
    dataframe[column].values for column in ('image_name', 'target')
)

In [5]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the cached X_*_images.npy arrays loaded later were presumably built
# from this exact split order - confirm before changing test_size or random_state.
X_train, X_test, y_train, y_test = train_test_split(
    image_names,
    targets,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Augmenting generator for training images: multiplies pixel values by 1/255 and
# applies random rotations, shifts, shears, zooms and horizontal flips.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    # geometric jitter
    rotation_range=20,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    # pixels exposed by a transform are filled from the nearest valid pixel
    fill_mode='nearest',
)

In [7]:
# Evaluation-time generator: same 1/255 rescaling as training, but no augmentation
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

In [8]:
# Function to load and preprocess images
def load_dcm_image(file_path, target_size=(224, 224)):
    """Read a DICOM file and return its pixel data resized to ``target_size``.

    Args:
        file_path: Path to the DICOM file as a string. The ``.dcm`` extension
            is appended only when the path does not already end with it, so
            both ``'train/ID_1'`` and ``'train/ID_1.dcm'`` are accepted.
        target_size: ``(width, height)`` passed to ``PIL.Image.resize``.
            Defaults to ``(224, 224)``, the size used throughout this notebook.

    Returns:
        A NumPy array containing the resized image. Pixel values are returned
        as produced by PIL; no rescaling or normalisation is applied here.
    """
    # Avoid producing "name.dcm.dcm" when the caller already supplied the extension
    if not file_path.lower().endswith('.dcm'):
        file_path = file_path + '.dcm'

    dcm_image = pydicom.dcmread(file_path)
    image = Image.fromarray(dcm_image.pixel_array)
    resized = image.resize(target_size)  # Resize to the input size of the neural network
    return np.array(resized)

In [9]:
# dir containing images
image_directory = 'train/'

# Convert image names to full paths (load_dcm_image adds the .dcm extension)
X_train_paths = [os.path.join(image_directory, fname) for fname in X_train]
X_test_paths = [os.path.join(image_directory, fname) for fname in X_test]


def _load_or_build_image_cache(cache_file, image_paths):
    """Return the image array stored in ``cache_file``, creating it if missing.

    When ``cache_file`` exists it is loaded with ``np.load``. Otherwise every
    path in ``image_paths`` is decoded with ``load_dcm_image``, the stacked
    array is written to ``cache_file`` with ``np.save`` and then returned.
    This replaces manually commenting/uncommenting the build-and-save code, so
    the notebook also runs from a fresh checkout.

    Args:
        cache_file: Path of the ``.npy`` cache file.
        image_paths: Iterable of image paths accepted by ``load_dcm_image``.

    Returns:
        NumPy array with one entry per path in ``image_paths``.
    """
    if os.path.exists(cache_file):
        return np.load(cache_file)

    # Slow path: decode every image once and save the result for future runs
    images = np.array([load_dcm_image(path) for path in image_paths])
    np.save(cache_file, images)
    return images


# Load the cached arrays, building them on the first run
X_train_images = _load_or_build_image_cache('X_train_images.npy', X_train_paths)
X_test_images = _load_or_build_image_cache('X_test_images.npy', X_test_paths)

In [None]:
# Build the model
# Backbone: MobileNetV2 with ImageNet weights and without its classification head
base_model = MobileNetV2(include_top=False, weights='imagenet')

# Head: global pooling -> 1024-unit hidden layer -> 128-unit embedding -> sigmoid output
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(units=1024, activation='relu')(x)  # new fully connected hidden layer
embeddings = Dense(units=128, activation='relu')(x)  # activations later reused as embeddings
predictions = Dense(units=1, activation='sigmoid')(embeddings)  # single-probability output

model = Model(outputs=predictions, inputs=base_model.input)

In [9]:
# Freeze the base model layers
# Every MobileNetV2 layer is marked non-trainable, so fitting only updates the
# dense layers added on top of the backbone.
for layer in base_model.layers:
    layer.trainable = False

In [10]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the metric
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(learning_rate=0.001),
)

# Checkpoint callback: best_model.h5 is overwritten only when the monitored value improves
checkpoint = ModelCheckpoint(filepath='best_model.h5', save_best_only=True, verbose=1)

# Snapshot of the model before any training on this dataset
model.save('pretrain_model.h5')

  saving_api.save_model(


In [8]:
# Sanity check: display the dimensions of the cached test image array
X_test_images.shape

(6626, 224, 224, 3)

In [10]:
class DCMGenerator(Sequence):
    """Keras ``Sequence`` that reads DICOM images from disk one batch at a time.

    Images are decoded with ``load_dcm_image`` when a batch is requested, so
    the whole dataset never has to be held in memory. Pixel values are passed
    through exactly as ``load_dcm_image`` returns them.

    Args:
        image_filenames: Sliceable collection of image paths accepted by
            ``load_dcm_image``.
        labels: Sliceable collection of targets aligned with
            ``image_filenames``.
        batch_size: Number of samples per batch; must be positive.
        **kwargs: Forwarded to ``Sequence.__init__``.

    Raises:
        ValueError: If the inputs differ in length or ``batch_size`` is not
            positive.
    """

    def __init__(self, image_filenames, labels, batch_size, **kwargs):
        # Newer Keras versions expect the base-class initialiser to be called
        super().__init__(**kwargs)
        if len(image_filenames) != len(labels):
            raise ValueError(
                f"image_filenames ({len(image_filenames)}) and labels "
                f"({len(labels)}) must have the same length"
            )
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.image_filenames, self.labels = image_filenames, labels
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.image_filenames) / self.batch_size))

    def __getitem__(self, idx):
        """Return ``(images, labels)`` for batch number ``idx`` as NumPy arrays."""
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_x = self.image_filenames[batch]
        batch_y = self.labels[batch]

        # Decode only the files that belong to this batch
        images = np.array([load_dcm_image(file_name) for file_name in batch_x])
        return images, np.array(batch_y)

#train_generator = DCMGenerator(X_train_paths, y_train, batch_size=32)
#validation_generator = DCMGenerator(X_test_paths, y_test, batch_size=32)

In [11]:
from sklearn.model_selection import train_test_split

# Carve a 10% validation set out of the training paths and their labels
X_train_new, X_val, y_train_new, y_val = train_test_split(
    X_train_paths,
    y_train,
    random_state=42,
    test_size=0.1,
)

# Both generators decode DICOM files from disk on demand, 32 images per batch
train_generator, validation_generator = (
    DCMGenerator(paths, labels, batch_size=32)
    for paths, labels in ((X_train_new, y_train_new), (X_val, y_val))
)


In [15]:
# Train the model
# Both generators are Sequence objects, so Keras takes the number of batches per
# epoch from len(generator). The previous explicit `len(...) // 32` step counts
# used floor division while the generators' __len__ uses ceil, so a partial batch
# was left out of every epoch and validation pass, and the count became 0 for
# fewer than 32 samples.
model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=10,
    callbacks=[checkpoint]  # writes best_model.h5 whenever the monitored value improves
)

# Keep the final-epoch weights as well; these can differ from the best checkpoint
model.save('postval_model.h5')



Epoch 1/10
Epoch 1: val_loss improved from inf to 0.07632, saving model to best_model.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.07632 to 0.07302, saving model to best_model.h5
Epoch 3/10
Epoch 3: val_loss improved from 0.07302 to 0.07227, saving model to best_model.h5
Epoch 4/10
Epoch 4: val_loss did not improve from 0.07227
Epoch 5/10
Epoch 5: val_loss improved from 0.07227 to 0.07227, saving model to best_model.h5
Epoch 6/10
Epoch 6: val_loss did not improve from 0.07227
Epoch 7/10
Epoch 7: val_loss improved from 0.07227 to 0.07055, saving model to best_model.h5
Epoch 8/10
Epoch 8: val_loss did not improve from 0.07055
Epoch 9/10
Epoch 9: val_loss improved from 0.07055 to 0.06996, saving model to best_model.h5
Epoch 10/10
Epoch 10: val_loss did not improve from 0.06996


In [12]:
# Load the best model
# best_model.h5 is the file written by the ModelCheckpoint callback (save_best_only=True)
best_model = load_model('best_model.h5')

2023-11-07 21:08:53.591671: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-07 21:08:53.591972: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
best_model.summary() # get the name of the second-to-last layer (the layer before predictions)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, None, 3)]      0         []                            
                                                                                                  
 Conv1 (Conv2D)              (None, None, None, 32)       864       ['input_1[0][0]']             
                                                                                                  
 bn_Conv1 (BatchNormalizati  (None, None, None, 32)       128       ['Conv1[0][0]']               
 on)                                                                                              
                                                                                                  
 Conv1_relu (ReLU)           (None, None, None, 32)       0         ['bn_Conv1[0][0]']        

In [14]:
# The embeddings are the activations of the layer directly before the final
# prediction layer. Select it by position rather than by a hard-coded,
# auto-generated name such as 'dense_1': Keras numbers layers per session, so the
# name changes whenever the model-building cell has been run more than once.
embedding_layer = best_model.layers[-2]
embedding_layer_name = embedding_layer.name  # kept for reference / logging

# Sub-model that maps an input image batch to the embedding activations
embedding_model = Model(inputs=best_model.input, outputs=embedding_layer.output)

# Save the embedding model
embedding_model.save('embedding_model_val.h5')




  saving_api.save_model(


In [None]:
# # Extract and save embeddings
# train_embeddings = embedding_model.predict(train_datagen.flow(X_train_images))
# test_embeddings = embedding_model.predict(test_datagen.flow(X_test_images))

# # Save the embeddings
# np.save('train_embeddings_val.npy', train_embeddings)
# np.save('test_embeddings_val.npy', test_embeddings)


In [None]:
# # We will use a smaller batch size to manage memory usage
# batch_size = 16

# # Calculate the correct number of steps per epoch
# train_embeddings_steps = len(X_train_images) // batch_size
# test_embeddings_steps = len(X_test_images) // batch_size

# # Extract embeddings for the training set
# train_embeddings = embedding_model.predict(
#     train_datagen.flow(X_train_images, batch_size=batch_size, shuffle=False),
#     steps=train_embeddings_steps,
#     verbose=1
# )

# # Extract embeddings for the test set
# test_embeddings = embedding_model.predict(
#     test_datagen.flow(X_test_images, batch_size=batch_size, shuffle=False),
#     steps=test_embeddings_steps,
#     verbose=1
# )

# # Save the embeddings
# np.save('train_embeddings_val.npy', train_embeddings)
# np.save('test_embeddings_val.npy', test_embeddings)


In [18]:
class EmbeddingGenerator(Sequence):
    """Run a model over a collection of images batch by batch.

    Despite the parameter name, ``image_filenames`` is sliced and fed to the
    model directly. It must therefore hold image arrays that are ready for
    prediction (for example the cached ``X_train_images`` array), not file
    paths.

    Args:
        image_filenames: Sliceable collection of image arrays.
        batch_size: Number of images sent to the model per call.
        model: Object exposing ``predict_on_batch``; its output is returned as
            the embedding for each batch.
        **kwargs: Forwarded to ``Sequence.__init__``.
    """

    def __init__(self, image_filenames, batch_size, model, **kwargs):
        # Newer Keras versions expect the base-class initialiser to be called
        super().__init__(**kwargs)
        self.image_filenames = image_filenames
        self.batch_size = batch_size
        self.model = model

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.image_filenames) / self.batch_size))

    def __getitem__(self, idx):
        """Return the model output for batch number ``idx``."""
        start = idx * self.batch_size
        # The slice already contains image data; asarray avoids an element-wise copy
        batch_images = np.asarray(self.image_filenames[start:start + self.batch_size])

        # Get embeddings
        return self.model.predict_on_batch(batch_images)

    def save_embeddings(self, output_path_template):
        """Write every batch's embeddings to its own ``.npy`` file.

        Args:
            output_path_template: ``str.format`` template with one placeholder
                for the zero-based batch index, for example
                ``'train_embeds/train_embeddings_chunk_{}.npy'``. Missing
                parent directories are created.
        """
        total_batches = len(self)
        for i in range(total_batches):
            output_path = output_path_template.format(i)
            output_dir = os.path.dirname(output_path)
            if output_dir:
                # np.save does not create directories, so create them here
                os.makedirs(output_dir, exist_ok=True)
            np.save(output_path, self[i])
            print(f"Saved batch {i+1}/{total_batches}")


In [22]:
# One EmbeddingGenerator per split, fed with the cached image arrays
train_embedding_generator = EmbeddingGenerator(
    image_filenames=X_train_images,
    batch_size=32,
    model=embedding_model,
)
test_embedding_generator = EmbeddingGenerator(
    image_filenames=X_test_images,
    batch_size=32,
    model=embedding_model,
)

# Write the embeddings to disk as one .npy file per batch: train first, then test
train_embedding_generator.save_embeddings(
    'train_embeds/train_embeddings_chunk_{}.npy'
)
test_embedding_generator.save_embeddings(
    'test_embeds/test_embeddings_chunk_{}.npy'
)


Saved batch 1/829
Saved batch 2/829
Saved batch 3/829
Saved batch 4/829
Saved batch 5/829
Saved batch 6/829
Saved batch 7/829
Saved batch 8/829
Saved batch 9/829
Saved batch 10/829
Saved batch 11/829
Saved batch 12/829
Saved batch 13/829
Saved batch 14/829
Saved batch 15/829
Saved batch 16/829
Saved batch 17/829
Saved batch 18/829
Saved batch 19/829
Saved batch 20/829
Saved batch 21/829
Saved batch 22/829
Saved batch 23/829
Saved batch 24/829
Saved batch 25/829
Saved batch 26/829
Saved batch 27/829
Saved batch 28/829
Saved batch 29/829
Saved batch 30/829
Saved batch 31/829
Saved batch 32/829
Saved batch 33/829
Saved batch 34/829
Saved batch 35/829
Saved batch 36/829
Saved batch 37/829
Saved batch 38/829
Saved batch 39/829
Saved batch 40/829
Saved batch 41/829
Saved batch 42/829
Saved batch 43/829
Saved batch 44/829
Saved batch 45/829
Saved batch 46/829
Saved batch 47/829
Saved batch 48/829
Saved batch 49/829
Saved batch 50/829
Saved batch 51/829
Saved batch 52/829
Saved batch 53/829
Sa

In [24]:
import numpy as np
import os

def _chunk_index(file_name):
    """Return the integer chunk number at the end of ``file_name``.

    For ``'train_embeddings_chunk_12.npy'`` the result is ``12``. The extension
    is removed with ``os.path.splitext`` rather than ``str.strip('.npy')``,
    because ``strip`` removes any of the characters '.', 'n', 'p', 'y' from
    both ends of the string instead of removing the suffix.
    """
    stem, _extension = os.path.splitext(file_name)
    return int(stem.rsplit('_', 1)[-1])


def combine_embedding_chunks(chunk_dir, chunk_prefix, combined_name):
    """Concatenate per-batch embedding files into one array and save it.

    Args:
        chunk_dir: Directory containing the chunk files; the combined file is
            written here too.
        chunk_prefix: File-name prefix shared by the chunk files.
        combined_name: File name for the combined ``.npy`` array.

    Returns:
        The concatenated NumPy array, with chunks ordered by numeric suffix.

    Raises:
        ValueError: If no matching chunk file exists in ``chunk_dir``.
    """
    # Get a list of all the saved chunk files
    file_names = [
        fn for fn in os.listdir(chunk_dir)
        if fn.startswith(chunk_prefix) and fn.endswith('.npy')
    ]
    if not file_names:
        raise ValueError(f"No '{chunk_prefix}*.npy' files found in {chunk_dir!r}")

    # Numeric sort keeps chunk 10 after chunk 9 (a plain string sort would not)
    file_names.sort(key=_chunk_index)

    # Load and concatenate the arrays along the sample axis
    combined = np.concatenate(
        [np.load(os.path.join(chunk_dir, fn)) for fn in file_names], axis=0
    )

    # Save the combined array next to the chunks
    np.save(os.path.join(chunk_dir, combined_name), combined)
    return combined


# Train
file_path = 'train_embeds/'  # Update this to the path where your files are saved
all_embeddings = combine_embedding_chunks(
    file_path, 'train_embeddings_chunk_', 'combined_train_embeddings.npy'
)

# Test
file_path = 'test_embeds/'  # Update this to the path where your files are saved
all_embeddings = combine_embedding_chunks(
    file_path, 'test_embeddings_chunk_', 'combined_test_embeddings.npy'
)

In [26]:
# Score the best checkpoint on the held-out test images; evaluate() returns the
# loss followed by the compiled metric (accuracy).
# NOTE(review): accuracy alone can look strong on an imbalanced target - confirm
# the class balance before drawing conclusions from it.
test_loss, test_acc = best_model.evaluate(
    x=X_test_images,
    y=y_test,
    verbose=2,
)
print(f'Test accuracy: {test_acc}, Test loss: {test_loss}')

208/208 - 26s - loss: 0.0839 - accuracy: 0.9819 - 26s/epoch - 124ms/step
Test accuracy: 0.981889545917511, Test loss: 0.08389921486377716
