In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import os
from mammo_utils import convert_images_to_array

In [14]:
def load_label_frame(pickle_path):
    """Load a pickled label table and return it indexed by unique IMAGE_NAME.

    Parameters
    ----------
    pickle_path : str
        Path to a pickled pandas DataFrame that exposes ``IMAGE_NAME`` either
        as its index or as a column.

    Returns
    -------
    pandas.DataFrame
        The label table with one row per ``IMAGE_NAME`` (the first occurrence
        is kept) and ``IMAGE_NAME`` as the index.
    """
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    labels = pd.read_pickle(pickle_path)
    # reset_index() moves the current index into the columns so that duplicates
    # can be dropped on the IMAGE_NAME column (presumably the original index).
    labels = labels.reset_index().drop_duplicates(subset="IMAGE_NAME", keep="first")
    # Put IMAGE_NAME back as the index (this sets the index, it does not reset it).
    return labels.set_index("IMAGE_NAME")


# read in the label data for the training and test sets
labels_tr = load_label_frame(os.path.join("data", "train_labels.pkl"))
labels_te = load_label_frame(os.path.join("data", "test_labels.pkl"))


In [6]:
# Read the mass and calcification training JPEGs, together with their labels,
# using the training label table.
(mass_train_images, mass_train_labels), (calc_train_images, calc_train_labels) = (
    convert_images_to_array(jpeg_dir, label_data=labels_tr)
    for jpeg_dir in (
        "data/new/Mass Train/AllJPEGS299",
        "data/new/Calc Train/AllJPEGS299",
    )
)

# Stack both sources along the first axis, mass entries first, so that the
# images and labels stay aligned.
train_labels = np.concatenate((mass_train_labels, calc_train_labels), axis=0)
train_images = np.concatenate((mass_train_images, calc_train_images), axis=0)

# The images are grayscale, so every channel is identical: keep channel 0 only
# and reshape to restore an explicit single-channel axis.
train_images = train_images[:, :, :, 0].reshape(-1, 299, 299, 1)

# Persist the arrays (images first, then labels).
np.save(os.path.join("data", "train_images299.npy"), train_images)
np.save(os.path.join("data", "train_labels.npy"), train_labels)

In [7]:
# Sanity check: the number of labels should match the first image dimension.
print(f"Train Labels: {len(train_labels)}")
print(f"Train Data: {train_images.shape}")

Train Labels: 2458
Train Data: (2458, 299, 299, 1)


In [12]:
# Read the mass and calcification test JPEGs, together with their labels,
# using the test label table.
(mass_test_images, mass_test_labels), (calc_test_images, calc_test_labels) = (
    convert_images_to_array(jpeg_dir, label_data=labels_te)
    for jpeg_dir in (
        "data/new/Mass Test/AllJPEGS299",
        "data/new/Calc Test/AllJPEGS299",
    )
)

# Stack both sources along the first axis, mass entries first, so that the
# images and labels stay aligned.
test_labels = np.concatenate((mass_test_labels, calc_test_labels), axis=0)
test_images = np.concatenate((mass_test_images, calc_test_images), axis=0)

# The images are grayscale, so every channel is identical: keep channel 0 only
# and reshape to restore an explicit single-channel axis.
test_images = test_images[:, :, :, 0].reshape(-1, 299, 299, 1)

# Persist the arrays (images first, then labels).
np.save(os.path.join("data", "test_images299.npy"), test_images)
np.save(os.path.join("data", "test_labels.npy"), test_labels)

In [13]:
# Sanity check: the number of labels should match the first image dimension.
print(f"Test Labels: {len(test_labels)}")
print(f"Test Data: {test_images.shape}")

Test Labels: 643
Test Data: (643, 299, 299, 1)


In [8]:
# Read the saved train and test label arrays back from disk.
train_labels, test_labels = (
    np.load(npy_path)
    for npy_path in ("data/train_labels.npy", "data/test_labels.npy")
)

In [9]:
# Fold each "benign without callback" class into the matching plain benign
# class. Both label arrays are edited in place through a boolean mask.
for _labels in (train_labels, test_labels):
    for _old_name, _new_name in (
        ('BENIGN_WITHOUT_CALLBACK_mass', "BENIGN_mass"),
        ('BENIGN_WITHOUT_CALLBACK_calcification', "BENIGN_calcification"),
    ):
        _labels[_labels == _old_name] = _new_name

# Remove the loop temporaries so they do not linger in the notebook namespace.
del _labels, _old_name, _new_name

In [10]:
# Overwrite the saved label files with the relabelled arrays.
np.save(file="data/train_labels.npy", arr=train_labels)
np.save(file="data/test_labels.npy", arr=test_labels)

In [11]:
# Share of each class in the test labels. The top-level pd.value_counts is
# deprecated in recent pandas, so wrap the array in a Series and use the
# method form, which gives the same normalized counts.
pd.Series(test_labels).value_counts(normalize=True)

BENIGN_mass                0.357724
BENIGN_calcification       0.255285
MALIGNANT_mass             0.235772
MALIGNANT_calcification    0.151220
dtype: float64