In [1]:
import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import math

def detect_and_calculate_area_ratio(image):
    """
    Return the fraction of the image covered by bright object(s).

    The image is binarised at a fixed grey level of 127, the outer contours of
    the resulting white regions are extracted, and their summed contour area is
    divided by the total number of pixels.

    Parameters
    ----------
    image : PIL.Image.Image
        Input image in any PIL mode (e.g. 'RGB', 'RGBA', 'L', '1', 'P').

    Returns
    -------
    float
        Summed contour area of the white object(s) divided by width * height.
    """
    # Normalise to 3-channel RGB first. Grayscale, 1-bit and palette images
    # convert to 2-D arrays, which cv2.cvtColor rejects for RGB conversions.
    rgb = np.array(image.convert('RGB'))

    # Convert straight to grayscale (same result as going RGB -> BGR -> GRAY)
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Threshold the image (assuming objects are white and the background is black)
    _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

    # Only the outermost contours are requested (RETR_EXTERNAL)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Total area enclosed by the contours (objects)
    total_object_area = sum(cv2.contourArea(cnt) for cnt in contours)

    # Total image area in pixels (rows * columns)
    total_image_area = gray.shape[0] * gray.shape[1]

    return total_object_area / total_image_area


def gather_images(directory, ratio_threshold=0.75):
    """
    Recursively collect .png files and flag those with a small object area.

    Parameters
    ----------
    directory : str
        Root directory; it and all of its sub-directories are walked.
    ratio_threshold : float, optional
        An image is flagged when its object-to-image area ratio (as returned by
        ``detect_and_calculate_area_ratio``) is strictly below this value.
        Defaults to 0.75, the value previously hard-coded here.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(png_paths, target_images)``: every .png path found, and the subset
        whose area ratio is below ``ratio_threshold``.
    """
    # Paths of all .png images found
    png_paths = []

    # Paths of the .png images whose area ratio is below ratio_threshold
    target_images = []

    # Walk through all directories and files in the given directory
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.png'):
                continue

            path = os.path.join(root, file)
            png_paths.append(path)

            # Keep the file open only while the ratio is being computed
            with Image.open(path) as img:
                ratio = detect_and_calculate_area_ratio(img)

            if ratio is not None and ratio < ratio_threshold:
                target_images.append(path)

    return png_paths, target_images

def display_images(image_paths, fig_size, n_cols=10):
    """
    Show the given images in a grid of ``n_cols`` columns.

    Parameters
    ----------
    image_paths : sequence of str
        Paths of the images to display, in display order.
    fig_size : float
        Height of the figure in inches (the width is fixed at 20).
    n_cols : int, optional
        Number of columns in the grid. Defaults to 10.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``image_paths`` is empty.
    """
    image_paths = list(image_paths)
    num_images = len(image_paths)

    # A grid with zero rows is not a valid subplot layout, so bail out early
    if num_images == 0:
        return

    # Calculate number of rows needed
    num_rows = math.ceil(num_images / n_cols)

    # squeeze=False keeps `axs` two-dimensional for every grid shape
    fig, axs = plt.subplots(num_rows, n_cols, figsize=(20, fig_size), squeeze=False)
    axes = axs.flatten()

    # Hide the frame of every cell, including the unused trailing ones
    for ax in axes:
        ax.axis('off')

    for ax, image_path in zip(axes, image_paths):
        # Close the file handle as soon as the image has been drawn
        with Image.open(image_path) as img:
            ax.imshow(img)

    # Show the plot with all images
    plt.show()

In [None]:
# Root directory that gather_images walks recursively for .png files
image_path = 'edof_new_sma_2023_cells'
# png_paths: every .png found; target_images: those whose area ratio is below gather_images' threshold
png_paths, target_images = gather_images(image_path)

In [11]:
# Flagged images first, then all PNGs found (one count per line)
print(len(target_images), len(png_paths), sep='\n')

579
2737


In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from skimage import io, color, filters, measure
import os
import matplotlib.patches as patches
from scipy import stats

def _largest_region_descriptors(image_path):
    """Return the descriptor dict of the largest below-threshold region in one image, or None if there is none."""
    image = io.imread(image_path)

    if image.ndim == 2:
        # Already single-channel: use it directly
        gray_image = image
    else:
        # Keep only the first three channels so an alpha channel, if present,
        # does not reach rgb2gray
        gray_image = color.rgb2gray(image[..., :3])

    # Otsu threshold; pixels darker than the threshold form the foreground
    threshold_value = filters.threshold_otsu(gray_image)
    binary_image = gray_image < threshold_value

    # Label connected components and measure them
    label_image = measure.label(binary_image)
    regions = measure.regionprops(label_image, intensity_image=gray_image)
    if not regions:
        return None

    # Only the region with the largest area is described
    region = max(regions, key=lambda candidate: candidate.area)

    return {
        'area': region.area,
        'filled_area': region.filled_area,
        'equivalent_diameter': region.equivalent_diameter,
        'eccentricity': region.eccentricity,
        'convex_area': region.convex_area,
        'extent': region.extent,
        'solidity': region.solidity,
        'perimeter': region.perimeter,
        'image_path': image_path
    }


def _plot_descriptor_boxplots(descriptors):
    """Draw one box plot per descriptor column, split by the boolean 'Source' flag."""
    # 'Source' is True when a path component starts with 'sma'. A plain
    # substring test would also match 'non-sma' and any parent folder whose
    # name merely contains 'sma'.
    labelled = descriptors.assign(
        Source=descriptors['image_path'].str.contains(r'(?:^|[\\/])sma', regex=True)
    )

    # Reshape the dataframe suitable for sns.boxplot
    df_melt = labelled.melt(id_vars=['Source', 'image_path'])

    # One figure per descriptor
    for col in df_melt['variable'].unique():
        subset = df_melt[df_melt['variable'] == col]
        plt.figure(figsize=(5, 4))
        sns.boxplot(x='variable', y='value', hue='Source', data=subset, palette='PRGn')
        plt.title(f"{col}")
        plt.show()


def get_image_descriptors(image_paths):
    """
    Measure the largest thresholded region of each image and plot the results.

    Each existing file is read, converted to grayscale, thresholded with Otsu's
    method, and the largest connected below-threshold region is described.
    Missing files are reported on stdout and skipped; images without any region
    are skipped silently. One box plot per descriptor is shown when at least one
    image was measured.

    Parameters
    ----------
    image_paths : iterable of str
        Paths of the images to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per measured image with the columns 'area', 'filled_area',
        'equivalent_diameter', 'eccentricity', 'convex_area', 'extent',
        'solidity', 'perimeter' and 'image_path'. The frame is empty (but has
        these columns) when nothing could be measured.
    """
    descriptor_columns = [
        'area', 'filled_area', 'equivalent_diameter', 'eccentricity',
        'convex_area', 'extent', 'solidity', 'perimeter', 'image_path',
    ]

    # One descriptor dictionary per successfully measured image
    all_descriptors = []

    for image_path in image_paths:
        # Check if the file exists
        if not os.path.isfile(image_path):
            print(f"File {image_path} not found.")
            continue

        descriptor_dict = _largest_region_descriptors(image_path)
        if descriptor_dict is not None:
            all_descriptors.append(descriptor_dict)

    # Explicit columns keep the schema stable even when the list is empty
    df = pd.DataFrame(all_descriptors, columns=descriptor_columns)

    # Nothing to plot for an empty frame
    if not df.empty:
        _plot_descriptor_boxplots(df)

    return df

In [None]:
# Measure the largest region of every collected PNG (this call also draws one box plot per descriptor)
df = get_image_descriptors(png_paths)

In [14]:
# Number of rows in the descriptor table (one per measured image)
len(df)

2737

In [None]:
# Per-column count of the rows whose largest-region area is below 5000
small_regions_df_count = df.loc[df['area'] < 5000].count()
print(small_regions_df_count)

In [None]:
# Rows whose largest-region area is below 5000
small_regions_df = df[df['area'] < 5000]

# Unique image paths of those rows
small_regions_paths = small_regions_df['image_path'].unique()

# Grid geometry: fixed column count, ceil-divided row count
n_images = len(small_regions_paths)
n_cols = 10
n_rows = -(-n_images // n_cols)

if n_images:
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), squeeze=False)

    # Flatten the axes array, so we can easily iterate over it
    axes = axes.flatten()

    # The loop variable is deliberately not called `image_path`, which holds
    # the dataset directory passed to gather_images
    for ax, png_path in zip(axes, small_regions_paths):
        image = io.imread(png_path)
        ax.imshow(image)
        ax.set_title(f"Image {png_path}")
        ax.axis('off')

    # If there are fewer images than total subplots, remove the extras
    for ax in axes[n_images:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    # A grid with zero rows cannot be created, so there is nothing to draw
    print("No images with area < 5000 to display.")

# Plain Python list of the same unique paths, used later to filter png_paths
small_regions_paths_list = list(small_regions_paths)
small_regions_paths_list

In [17]:
# Per-column count of the rows whose largest-region area is above 13000
large_regions_df_count = df.loc[df['area'] > 13000].count()
print(large_regions_df_count)

area                   248
filled_area            248
equivalent_diameter    248
eccentricity           248
convex_area            248
extent                 248
solidity               248
perimeter              248
image_path             248
dtype: int64


In [None]:
# Rows whose largest-region area is above 13000
large_regions_df = df[df['area'] > 13000]

# Unique image paths of those rows
large_regions_paths = large_regions_df['image_path'].unique()

# Grid geometry: fixed column count, ceil-divided row count
n_images = len(large_regions_paths)
n_cols = 10
n_rows = -(-n_images // n_cols)

if n_images:
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), squeeze=False)

    # Flatten the axes array, so we can easily iterate over it
    axes = axes.flatten()

    # The loop variable is deliberately not called `image_path`, which holds
    # the dataset directory passed to gather_images
    for ax, png_path in zip(axes, large_regions_paths):
        image = io.imread(png_path)
        ax.imshow(image)
        ax.set_title(f"Image {png_path}")
        ax.axis('off')

    # If there are fewer images than total subplots, remove the extras
    for ax in axes[n_images:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    # A grid with zero rows cannot be created, so there is nothing to draw
    print("No images with area > 13000 to display.")

# Plain Python list of the same unique paths, used later to filter png_paths
large_regions_paths_list = list(large_regions_paths)

large_regions_paths_list

In [24]:
# Drop every path that was listed as a small-area or a large-area outlier
png_paths = [
    path
    for path in png_paths
    if path not in small_regions_paths_list and path not in large_regions_paths_list
]
len(png_paths)

2279

In [25]:
# Per-column count of the rows whose largest-region perimeter is above 750
large_regions_df_count = df.loc[df['perimeter'] > 750].count()
print(large_regions_df_count)

area                   288
filled_area            288
equivalent_diameter    288
eccentricity           288
convex_area            288
extent                 288
solidity               288
perimeter              288
image_path             288
dtype: int64


In [None]:
# Rows whose largest-region perimeter is above 750
p_large_regions_df = df[df['perimeter'] > 750]

# Unique image paths of those rows
p_large_regions_paths = p_large_regions_df['image_path'].unique()

# Grid geometry: fixed column count, ceil-divided row count
n_images = len(p_large_regions_paths)
n_cols = 6
n_rows = -(-n_images // n_cols)

if n_images:
    # squeeze=False keeps `axes` two-dimensional whatever the grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), squeeze=False)

    # Flatten the axes array, so we can easily iterate over it
    axes = axes.flatten()

    # The loop variable is deliberately not called `image_path`, which holds
    # the dataset directory passed to gather_images
    for ax, png_path in zip(axes, p_large_regions_paths):
        image = io.imread(png_path)
        ax.imshow(image)

        # Shorten the title to the part of the path from '/non-sma' or '/sma' onwards
        if '/non-sma' in png_path:
            title = '/non-sma' + png_path.split('/non-sma')[-1]
        else:
            title = '/sma' + png_path.split('/sma')[-1]

        ax.set_title(f"Image {title}")
        ax.axis('off')

    # If there are fewer images than total subplots, remove the extras
    for ax in axes[n_images:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    # A grid with zero rows cannot be created, so there is nothing to draw
    print("No images with perimeter > 750 to display.")

# Plain Python list of the same unique paths, used later to filter png_paths
p_large_regions_paths_list = list(p_large_regions_paths)

p_large_regions_paths_list

In [27]:
# Drop every path that was listed as a large-perimeter outlier
png_paths = [path for path in png_paths if path not in p_large_regions_paths_list]
len(png_paths)

2279

In [None]:
# Recompute the descriptors (and redraw the box plots) for the filtered path list.
# Note: this rebinds `df`, replacing the table built from the unfiltered list.
df = get_image_descriptors(png_paths)

In [29]:
# Every path flagged by any of the four filters, duplicates included
all_paths = [
    *target_images,
    *p_large_regions_paths_list,
    *large_regions_paths_list,
    *small_regions_paths_list,
]
print(len(all_paths))

# One row per distinct path under the column 'Paths'
# NOTE(review): this rebinds `df`, which until here held the descriptor table;
# re-running an earlier cell that reads df['area'] after this one will fail.
df = pd.DataFrame(all_paths, columns=['Paths']).drop_duplicates()
print(len(df))

# Persist the de-duplicated list as CSV, without the index column
df.to_csv('images_to_remove1.csv', index=False)

1129
787
