In [1]:
import os
import cv2
import shutil
import random
import numpy as np
import glob
import pathlib 
import random
from PIL import Image
from IPython.display import display
import matplotlib.pyplot as plt 
import hashlib

In [3]:
## DATA PREPROCESSING IN TWO DATASETS ##
## FIRST PART: APPLY CHANGES TO IRRELEVANT DATA ##
# Source folders of the "irrelevant" images. Later cells address these folders
# by position (paths[0], paths[4], paths[5], ...), so the order must not change.
paths = ["./Irrelevant_data/unsplash-images/",  # paths[0]: unsplash images
         "./Irrelevant_data/mri/",  # paths[1]: MRI images
         "./Irrelevant_data/chest_xray/train/NORMAL",  # paths[2]: chest x-ray images
         "./Irrelevant_data/histopathology/train/malignant/",  # paths[3]: histopathology images
         "./Irrelevant_data/mammography/images/",  # paths[4]: mammography images
         "./Irrelevant_data/abdominal_US/abdominal_US/RUS/images/train/"]  # paths[5]: ultrasound images

In [5]:
#Duplicate removal
def remove_duplicates(folder_path):
    """Delete byte-identical duplicate images that sit directly in ``folder_path``.

    Only ``.jpg``, ``.jpeg`` and ``.png`` files (any letter case) are examined;
    sub-folders are not searched. Files are compared by the MD5 digest of their
    contents. Candidates are visited in sorted path order, so for every group of
    identical files the one with the smallest path is kept and the others are
    deleted from disk. A line is printed for each deleted file.

    Args:
        folder_path: Folder whose image files should be de-duplicated.

    Returns:
        None. The effect is the deletion of files on disk.
    """
    patterns = ("*.[jJ][pP][gG]", "*.[jJ][pP][eE][gG]", "*.[pP][nN][gG]")

    # A set guarantees that no path is visited twice; visiting a file twice
    # would make it look like its own duplicate and delete it.
    candidates = set()
    for pattern in patterns:
        candidates.update(glob.glob(os.path.join(folder_path, pattern)))

    seen_hashes = {}
    # Sorted so the surviving copy does not depend on the directory listing order.
    for imagefile in sorted(candidates):
        digest = hashlib.md5()
        # Hash in 1 MiB chunks and close the handle promptly instead of
        # reading the whole file through an unclosed file object.
        with open(imagefile, 'rb') as handle:
            for chunk in iter(lambda: handle.read(1 << 20), b""):
                digest.update(chunk)
        file_hash = digest.hexdigest()

        if file_hash in seen_hashes:
            os.remove(imagefile)
            print(f"Removed duplicate: {imagefile}")
        else:
            seen_hashes[file_hash] = imagefile

# Remove duplicates in every source folder listed in `paths`.
# NOTE: remove_duplicates deletes files from these folders on disk (os.remove),
# so this cell permanently modifies the source data.
for path in paths:
    remove_duplicates(path)

In [9]:
# Print how many image files each source folder contains.
# The matches of all three extensions are added together. Chaining the globs
# with `or` would stop at the first extension that matches anything and
# under-count folders that mix extensions. The patterns are case-insensitive,
# the same ones remove_duplicates uses.
# NOTE: the recorded output below this cell may change for folders that contain
# mixed or upper-case extensions.
count_patterns = ("*.[jJ][pP][gG]", "*.[jJ][pP][eE][gG]", "*.[pP][nN][gG]")
for folder in paths:
    folder_files = [
        image_file
        for pattern in count_patterns
        for image_file in glob.glob(os.path.join(folder, pattern))
    ]
    print(len(folder_files))

2184
14392
1340
777
239
400


In [25]:
# List of valid image extensions (lower-case, each one including the leading dot)
valid_image_extensions = [".jpg", ".jpeg", ".png", ".jfif"]

# Function to check if the file is an image
def is_image_file(filename):
    """Return True when ``filename`` ends with one of ``valid_image_extensions``.

    The comparison ignores letter case. Every extension carries its leading
    dot, so a name that merely ends in the letters "jfif" (without ".") is not
    treated as an image.

    Args:
        filename: File name or path to check.

    Returns:
        bool: True if the name has a recognised image extension.
    """
    # str.endswith accepts a tuple of suffixes, which replaces the any() loop.
    return filename.lower().endswith(tuple(valid_image_extensions))

# Function to mirror image horizontally
def mirror_image(image_path):
    """Read the image at ``image_path`` and return a left-right flipped copy.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.

    Returns:
        The flipped image as returned by ``cv2.flip``, or None when
        ``cv2.imread`` could not load the file (a message is printed then).
    """
    source = cv2.imread(image_path)
    if source is not None:
        # flip code 1 mirrors the image horizontally
        return cv2.flip(source, 1)
    print(f"Failed to load image: {image_path}")
    return None

In [41]:
# Apply mirror transformation to each unsplash image and save
# The mirrored copy is written next to the original as "mirrored_<name>".
# Files that are already mirrored copies, or whose mirrored copy already exists,
# are skipped. Re-running the cell (or resuming after an interruption) therefore
# neither creates "mirrored_mirrored_..." files nor redoes finished work.
for image_file in os.listdir(paths[0]):
    if not is_image_file(image_file):  # Only process image files
        continue
    if image_file.startswith("mirrored_"):
        continue
    mirrored_image_path = os.path.join(paths[0], "mirrored_" + image_file)
    if os.path.exists(mirrored_image_path):
        continue
    image_path = os.path.join(paths[0], image_file)
    mirrored_image = mirror_image(image_path)
    if mirrored_image is not None:
        cv2.imwrite(mirrored_image_path, mirrored_image)

In [16]:
# Apply mirror transformation to each mammography image and save
# The mirrored copy is written next to the original as "mirrored_<name>".
# Files that are already mirrored copies, or whose mirrored copy already exists,
# are skipped. Re-running the cell (or resuming after an interruption) therefore
# neither creates "mirrored_mirrored_..." files nor redoes finished work.
for image_file in os.listdir(paths[4]):
    if not is_image_file(image_file):  # Only process image files
        continue
    if image_file.startswith("mirrored_"):
        continue
    mirrored_image_path = os.path.join(paths[4], "mirrored_" + image_file)
    if os.path.exists(mirrored_image_path):
        continue
    image_path = os.path.join(paths[4], image_file)
    mirrored_image = mirror_image(image_path)
    if mirrored_image is not None:
        cv2.imwrite(mirrored_image_path, mirrored_image)

KeyboardInterrupt: 

In [None]:
# Apply sufficient zoom to all real ultrasound images and save
# NOTE(review): `zoom_image_us` is not defined in any visible cell of this
# notebook; on a fresh kernel this cell raises NameError unless the function is
# defined elsewhere -- confirm where it comes from.
# Files that are already zoomed copies, or whose zoomed copy already exists, are
# skipped so that re-running the cell does not create "zoomed_zoomed_..." files.
for image_file in os.listdir(paths[5]):
    if image_file.startswith("zoomed_"):
        continue
    output_path = os.path.join(paths[5], "zoomed_" + image_file)
    if os.path.exists(output_path):
        continue
    image_path = os.path.join(paths[5], image_file)
    image = cv2.imread(image_path)
    # Check if image was loaded successfully (cv2.imread returns None otherwise)
    if image is None:
        continue
    zoomed_image = zoom_image_us(image)
    cv2.imwrite(output_path, zoomed_image)

In [27]:
#Random crop/zoom function
def random_zoom_image(image, zoom_factor=1.2):
    """Cut a randomly placed window out of ``image`` and scale it back up.

    The window measures ``int(height / zoom_factor)`` by
    ``int(width / zoom_factor)`` pixels; its top-left corner is drawn with
    ``random.randint``. The window is then resized to the original
    ``(width, height)`` with linear interpolation.

    Args:
        image: Array whose first two ``shape`` entries are height and width.
        zoom_factor: Divisor applied to both dimensions to get the window size.

    Returns:
        The resized window as returned by ``cv2.resize``.
    """
    height, width = image.shape[:2]
    crop_h = int(height / zoom_factor)
    crop_w = int(width / zoom_factor)

    # Row offset is drawn before the column offset.
    y0 = random.randint(0, height - crop_h)
    x0 = random.randint(0, width - crop_w)

    window = image[y0:y0 + crop_h, x0:x0 + crop_w]
    return cv2.resize(window, (width, height), interpolation=cv2.INTER_LINEAR)

In [28]:
#Black and white image conversion function
def black_and_white(image_path, show=False):
    """Open an image and return a copy converted to Pillow mode ``'1'``.

    Mode ``'1'`` is Pillow's 1-bit (pure black / pure white) pixel format, not
    a grayscale one.

    Args:
        image_path: Path of the image file to open.
        show: When True, also draw the converted image with ``plt.imshow``.
            Off by default: drawing on every call stacks one image per call on
            the current matplotlib axes, which is costly when the function is
            called in a loop over hundreds of files.

    Returns:
        The converted ``PIL.Image.Image``.
    """
    # The context manager closes the underlying file; convert() returns a new,
    # fully loaded image, so it stays usable after the file is closed.
    with Image.open(image_path) as image_file:
        bnw_image = image_file.convert('1')
    if show:
        plt.imshow(bnw_image)
    return bnw_image

In [None]:
#Apply the zoom and black-and-white functions to random unsplash images and save
image_files = os.listdir(paths[0])

# Sample only from real image files, so that filtering afterwards cannot shrink
# the sample. Outputs of this very cell ("zoomed_" / "bnw_") are left out so a
# re-run does not augment its own results. Sorting makes the population
# independent of the directory listing order.
candidate_images = sorted(
    name for name in image_files
    if is_image_file(name) and not name.startswith(("zoomed_", "bnw_"))
)

# random.sample raises ValueError when asked for more items than exist.
sample_size = min(300, len(candidate_images))
selected_images_zoom = random.sample(candidate_images, sample_size)
selected_images_bnw = random.sample(candidate_images, sample_size)

for image_file in selected_images_zoom:
    image_path = os.path.join(paths[0], image_file)
    image = cv2.imread(image_path)
    # cv2.imread returns None for unreadable files; random_zoom_image would
    # fail on image.shape in that case.
    if image is None:
        print(f"Failed to load image: {image_path}")
        continue
    zoomed_image = random_zoom_image(image)
    output_path = os.path.join(paths[0], "zoomed_" + image_file)
    cv2.imwrite(output_path, zoomed_image)

for image_file in selected_images_bnw:
    image_path = os.path.join(paths[0], image_file)
    bnw_image = black_and_white(image_path)
    output_path = os.path.join(paths[0], "bnw_" + image_file)
    bnw_image.save(output_path)

In [10]:
#Create folders for final dataset and two categories (irrelevant images and breast ultrasound images)
# os.makedirs(..., exist_ok=True) also creates the parent "final_dataset1" folder
# and does not fail when the folders already exist, so the cell can be re-run.
irrelevant_data = os.path.join("final_dataset1", "irrelevant_data")
breast_ultrasounds = os.path.join("final_dataset1", "breast_ultrasounds")
os.makedirs(irrelevant_data, exist_ok=True)
os.makedirs(breast_ultrasounds, exist_ok=True)

In [11]:
# Destination folder for the "irrelevant" class of the final dataset
folder_path = './final_dataset1/irrelevant_data'

# Collect the candidate files of every source folder (one extension per folder)
unspash_images = glob.glob(os.path.join(paths[0], "*.jpg"))
mri_images = glob.glob(os.path.join(paths[1], "*.jpg"))
xray_images = glob.glob(os.path.join(paths[2], "*.jpeg"))
hist_images = glob.glob(os.path.join(paths[3], "*.png"))
mamm_images = glob.glob(os.path.join(paths[4], "*.png"))
# NOTE(review): us_images is collected but never copied below -- confirm whether
# the ultrasound images from paths[5] were meant to be part of this class.
us_images = glob.glob(os.path.join(paths[5], "*.jpg"))


def _sample_up_to(images, count):
    """Return ``count`` random items of ``images``, or all of them if fewer exist.

    random.sample raises ValueError when asked for more items than are
    available; here a warning is printed and every available item is used.
    """
    if len(images) < count:
        print(f"Warning: requested {count} images but only {len(images)} are available")
        count = len(images)
    return random.sample(images, count)


# NOTE: shutil.copy into a folder keeps only the file's base name, so files with
# the same name coming from different source folders overwrite each other.

#Copy in the dataset all unsplash images
for image in unspash_images:
    shutil.copy(image, folder_path)

#Copy in the dataset 350 random images from mri, xray and histopathology respectively + 212 mammography images
mri_images10 = _sample_up_to(mri_images, 350)
xray_images10 = _sample_up_to(xray_images, 350)
hist_images10 = _sample_up_to(hist_images, 350)
mamm_images = _sample_up_to(mamm_images, 212)

for image in mri_images10 + xray_images10 + hist_images10 + mamm_images:
    shutil.copy(image, folder_path)

In [12]:
#Count the files in the irrelevant_data folder
irrelevant_folder = './final_dataset1/irrelevant_data'
files = []
for entry in os.listdir(irrelevant_folder):
    # keep regular files only; sub-folders are not counted
    if os.path.isfile(os.path.join(irrelevant_folder, entry)):
        files.append(entry)
print(len(files))

3446


In [13]:
## SECOND PART: COPY BREAST ULTRASOUND DATA ##
# Destination folder for the breast ultrasound class of the final dataset
folder_path = './final_dataset1/breast_ultrasounds'

# Every .png file of the three BUSI class folders is collected.
# NOTE(review): the folder name "Dataset_BUSI_with_GT" suggests ground-truth
# files may sit next to the scans; "*.png" would pick those up too -- confirm.
benign_images = glob.glob(os.path.join("./Dataset_BUSI_with_GT/benign/", "*.png"))
malignant_images = glob.glob(os.path.join("./Dataset_BUSI_with_GT/malignant/", "*.png"))
normal_images = glob.glob(os.path.join("./Dataset_BUSI_with_GT/normal/", "*.png"))

# Copy benign, then malignant, then normal images into the destination folder
for image in benign_images + malignant_images + normal_images:
    shutil.copy(image, folder_path)

In [14]:
#Count the files in the breast_ultrasounds folder
breast_folder = './final_dataset1/breast_ultrasounds'
files = []
for entry in os.listdir(breast_folder):
    # keep regular files only; sub-folders are not counted
    if os.path.isfile(os.path.join(breast_folder, entry)):
        files.append(entry)
print(len(files))

3460


In [15]:
# Train Validation and Test sets creation
base_dir = os.path.join(os.getcwd(), "final_dataset1")

# Source folders, one per class
irrelevant_data_dir, breast_ultrasounds_dir = (
    os.path.join(base_dir, name) for name in ('irrelevant_data', 'breast_ultrasounds')
)

# Destination folders, one per split
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, name) for name in ('train', 'val', 'test')
)

# One sub-directory per class (irrelevant_data, breast_ultrasounds) inside each split folder
for category in ['irrelevant_data', 'breast_ultrasounds']:
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(os.path.join(split_dir, category), exist_ok=True)

# Function to split images on a 70-15-15 percentage and copy them on the train,val and test folders
def split_and_copy_images(category, src_dir, train_dir, val_dir, test_dir, train_split=0.7, val_split=0.15, test_split=0.15, seed=None):
    """Shuffle the files of ``src_dir`` and copy them into train/val/test folders.

    The first ``int(n * train_split)`` shuffled files go to
    ``train_dir/category``, the next ``int(n * val_split)`` to
    ``val_dir/category`` and all remaining files to ``test_dir/category``.

    Args:
        category: Name of the class sub-folder created inside each split folder.
        src_dir: Folder whose regular files are split. Sub-folders are ignored.
        train_dir: Root folder of the training split.
        val_dir: Root folder of the validation split.
        test_dir: Root folder of the test split.
        train_split: Fraction of the files copied to the training split.
        val_split: Fraction of the files copied to the validation split.
        test_split: Not used in the computation; the test split always receives
            whatever remains after the train and validation files. Kept so
            existing calls keep working.
        seed: Optional seed. When given, the shuffle is done with a private
            ``random.Random(seed)`` and the split is reproducible; when None,
            the module-level ``random.shuffle`` is used as before.

    Returns:
        None. The effect is the copied files on disk.
    """
    # Regular files only: shutil.copy cannot copy a directory. Sorted so the
    # order fed to the shuffle does not depend on the directory listing order.
    images = sorted(
        path for path in glob.glob(os.path.join(src_dir, '*')) if os.path.isfile(path)
    )
    if seed is None:
        random.shuffle(images)  # Shuffle images randomly
    else:
        random.Random(seed).shuffle(images)

    total_images = len(images)
    train_count = int(total_images * train_split)
    val_count = int(total_images * val_split)

    assignments = (
        (train_dir, images[:train_count]),
        (val_dir, images[train_count:train_count + val_count]),
        (test_dir, images[train_count + val_count:]),
    )
    for split_dir, split_images in assignments:
        destination = os.path.join(split_dir, category)
        # Without an existing folder, shutil.copy would write every image to a
        # single file named after the category, each one overwriting the last.
        os.makedirs(destination, exist_ok=True)
        for img in split_images:
            shutil.copy(img, destination)

# Split and copy the images of both classes (irrelevant_data first, then breast_ultrasounds)
# NOTE(review): the mirrored_/zoomed_/bnw_ copies were written into the source
# folders before this split, so variants of one original image can land in
# different splits -- confirm this is acceptable for evaluation.
for category, src_dir in (
    ('irrelevant_data', irrelevant_data_dir),
    ('breast_ultrasounds', breast_ultrasounds_dir),
):
    split_and_copy_images(category, src_dir, train_dir, val_dir, test_dir)

print("Images have been split and copied on the folders successfully.")

Images have been split and copied on the folders successfully.
