#### resize images so they are a uniform size

In [7]:
# packages
from PIL import Image as img # image processing 
import os
import pandas as pd
import shutil # for emptying file folders and not creaet duplicates
import random
import time
import torchvision.transforms.v2 as transforms # transform images to create larger dataset
import gc

# set wd
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Fracture/')

In [8]:
# empty folders to avoid duplicates as there has been a few iterations of naming schemes used
shutil.rmtree('./images/fracture_resize')
# recreate the empty directory
os.makedirs('./images/fracture_resize')
print("Folder emptied successfully")

shutil.rmtree('./images/non_fractured_resize')
# recreate the empty directory
os.makedirs('./images/non_fractured_resize')
print("Folder emptied successfully")

shutil.rmtree('./images/resize_data') # empty resize data
os.makedirs('./images/resize_data') # make folder
print("Folder emptied succesfully") 

Folder emptied successfully
Folder emptied successfully
Folder emptied succesfully


In [13]:
# handle any open images data...
gc.enable()
gc.collect()

0

In [9]:
# Resize the images to 224x224, which is the most common size of images. 
# Images that are not this size also only have to downscale which is more optimal
# Also added more details to the file name to make sure everyname was unqiue

# function to resize
def resize(imagePath, targetSize, save_folder, type):
    # open and resize
    image = img.open(imagePath)
    image = image.resize(targetSize)
    
    # get base filename and add type to ensure unique file names
    base_name = os.path.splitext(os.path.basename(imagePath))[0]  # get filename without extension
    new_name = f"{type}{random.randint(0,9999)}_{random.randint(0,9999)}.jpg"  # Force .jpg extension
    
    # save the new image as JPG
    save_path = os.path.join(save_folder, new_name)
    image.save(save_path, 'JPEG')
    

# iterate over elements in folder | no need here, all images are fine
for filename in os.listdir('./images/original_data/Fractured'):
    file_path = os.path.join('./images/original_data/Fractured', filename)
    resize(file_path, (224,224), './images/fracture_resize', "frac")

# had to implement try and except because a few images were corrupted in this folder 
for filename in os.listdir('./images/original_data/Non_fractured'):
    try:
        file_path = os.path.join('./images/original_data/Non_fractured', filename)
        resize(file_path, (224,224),'./images/non_fractured_resize', "nfrac")
    except:
        # if there's a file error, print the error and continue
        print(f"Error processing {filename}, skipping...")
        # delete the file
        gc.collect() # hopefully clean mem
        time.sleep(2)
        try:
            os.remove(file_path)
        except:
            print(f"error {file_path}")
        continue

Error processing IMG0004028.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004028.jpg
Error processing IMG0004029.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004029.jpg
Error processing IMG0004036.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004036.jpg
Error processing IMG0004070.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004070.jpg
Error processing IMG0004073.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004073.jpg
Error processing IMG0004076.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004076.jpg
Error processing IMG0004079.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004079.jpg
Error processing IMG0004084.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004084.jpg
Error processing IMG0004092.jpg, skipping...
error ./images/original_data/Non_fractured\IMG0004092.jpg
Error processing IMG0004098.jpg, skipping...
error ./images/original_data

In [10]:
from PIL import ImageOps

# images to greyscale, cnn only takes input of 1
def convert_to_grayscale(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        image = img.open(file_path)
        try: 
            # Check if image is RGB (3 channels)
            if image.mode == 'RGB':
                # Convert to grayscale
                gray_image = ImageOps.grayscale(image)
                # Save back to the same location
                gray_image.save(file_path)
        except:
            print(f"error for file {file_path}")
            continue

    print('images converted') # completion

# Convert images in both folders
convert_to_grayscale('./images/fracture_resize')
convert_to_grayscale('./images/non_fractured_resize')

images converted
images converted


In [11]:
# check diffrence to ensure file alignemnt in size so model is efficetively learning pattern
inbalance = len(os.listdir('./images/fracture_resize')) - len(os.listdir('./images/non_fractured_resize'))
print(inbalance)

2795


In [None]:
transform = transforms.Compose([ # add random crop 
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),  
        transforms.RandomRotation(30),
        transforms.ScaleJitter(target_size=(224,224), scale_range=(0.8,1.2)),
        transforms.ColorJitter(brightness=0.25, contrast=0.4, saturation=0.4, hue=0.1)

    ])

new_per_folder = 12000

# make transforms to increase data amount
for i in range(new_per_folder):
    random_filename = random.choice(os.listdir('./images/fracture_resize'))
    path = os.path.join('./images/fracture_resize', random_filename)

    # Load and apply transformations
    random_image = img.open(path)
    transformed_image = transform(random_image)

    # change filename to prevent conflicts
    base_name = os.path.splitext(os.path.basename(path))[0]  # get filename without extension
    extension = os.path.splitext(path)[1]  # get extension
    new_path = f"{base_name}_{i}alter{extension}"
    
    # save image
    try:
        transformed_image.save(os.path.join('./images/fracture_resize', new_path))
    except:
        print("image skipped")
        continue

for i in range(new_per_folder + inbalance):
    random_filename = random.choice(os.listdir('./images/non_fractured_resize'))
    path = os.path.join('./images/non_fractured_resize', random_filename)

    # Load and apply transformations
    random_image = img.open(path)
    transformed_image = transform(random_image)

    # change filename to prevent conflicts
    base_name = os.path.splitext(os.path.basename(path))[0]  # get filename without extension
    extension = os.path.splitext(path)[1]  # get extension
    new_path = f"{base_name}_{i}alter{extension}"
    
    # save image
    transformed_image.save(os.path.join('./images/non_fractured_resize', new_path))

In [4]:
# convert images in both folders to bmp type
def convert_to_bmp(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg') or filename.endswith('.jpeg'):
            file_path = os.path.join(folder_path, filename)
            try:
                # Open and convert image
                image = img.open(file_path)
                new_filename = os.path.splitext(filename)[0] + '.bmp'
                new_path = os.path.join(folder_path, new_filename)
                image.save(new_path, 'BMP')
                # Remove original jpg file
                os.remove(file_path)
            except Exception as e:
                print(f"Error converting {filename}: {e}")
                continue
        else:
            print(f"{file_path} skipped") # if non jpeg files

# Convert images in both folders
convert_to_bmp('./images/fracture_resize')
convert_to_bmp('./images/non_fractured_resize')

In [5]:
# create a matching csv to prep for pytorch data loading
data = {'name': [], 'class': []}

# iterate over file names in fracture images
for filename in os.listdir('./images/fracture_resize'):
    data['name'].append(filename)
    data['class'].append(1) # FRACTURE = 1

for filename in os.listdir('./images/non_fractured_resize'):
    data['name'].append(filename)
    data['class'].append(0) # NO FRACTURE = 0

# create dataframe
class_id = pd.DataFrame(data)
class_id.to_csv('./images/class_ids.csv', index=False) # note the slight class imbalance that potentially needs to be accounted for

In [6]:
# create a new directory for combined data
try:
    shutil.rmtree('./images/resize_data') # empty resize data
    os.makedirs('./images/resize_data')
    print("Created new resize_data directory") 
except Exception as e:
    os.makedirs('./images/resize_data', exist_ok=True)
    print("Using existing resize_data directory")

# copy fracture images
for img_file in os.listdir('./images/fracture_resize'):
    src = os.path.join('./images/fracture_resize', img_file)
    dst = os.path.join('./images/resize_data', img_file)
    if os.path.exists(src):  # Check if the source file exists
        shutil.copy2(src, dst)
# copy non-fracture images
for img_file in os.listdir('./images/non_fractured_resize'):
    src = os.path.join('./images/non_fractured_resize', img_file)
    dst = os.path.join('./images/resize_data', img_file)
    if os.path.exists(src):  # Check if the source file exists
        shutil.copy2(src, dst)
    else:
        print(f"Source file not found: {src}")
    src = os.path.join('./images/non_fractured_resize', img_file)
    dst = os.path.join('./images/resize_data', img_file)
    shutil.copy2(src, dst)

print(f"Total images in resize_data: {len(os.listdir('./images/resize_data'))}")

Created new resize_data directory


Total images in resize_data: 30868


In [2]:
# Test for corrupted images and remove them
corrupted_files = []

for filename in os.listdir('./images/resize_data'):
    file_path = os.path.join('./images/resize_data', filename)
    try:
        # Try to open the image
        with img.open(file_path) as image:
            # Try to load the image data
            image.verify()
            image.close()
            
            # Double check by trying to load it again
            image = img.open(file_path)
            image.load()
            
    except Exception as e:
        print(f"Corrupted file found: {filename}")
        corrupted_files.append(filename)
        # Remove the corrupted file
        os.remove(file_path)
        
        # Also remove it from the class_id DataFrame if it exists
        class_id = class_id[class_id['name'] != filename]

print(f"Number of corrupted files removed: {len(corrupted_files)}")
if corrupted_files:
    print("Corrupted files:", corrupted_files)
    # Save updated class_id DataFrame
    class_id.to_csv('./images/class_ids.csv', index=False)

Number of corrupted files removed: 0
