In [1]:

# Standard library imports
import os
import sys
import logging

# Third-party imports
import pandas as pd
import numpy as np
import cv2
import seaborn as sns

from matplotlib import pyplot as plt

# Local module imports
sys.path.append(os.path.abspath('..'))
from utils.utils_data import get_classifications
from utils.utils_constants import (VESSEL_NEPTUNE_PAT_INFO_W_SCORE_PATH as  VESSEL_PAT_INFO_W_SCORE_PATH,
                                   VESSEL_NEPTUNE_PAT_INFO_W_SCORE_W_FEATURE_PATH as  VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH,
                                   DISEASE_TYPES, ARTERY_TYPES, CLASSIFICATION_SEVERITY_MAPPING,
                                   FEATURES_PATH, CROPPED_VESSELS_DIR)

from utils.utils_vis import gallery_view

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
suffix = "_measurements"

In [10]:
feature_names = ['Media Area Ratio', 'Intima Area Ratio', 'Lumen Area Ratio', 'Hyalinosis Area Ratio', 'Aspect Ratio', 'Convexity', 
                 'Intima Average', 'Intima Median', 'Intima Variance', 'Intima Peak Height', 'Intima Peak Prominence', 
                 'Media Average', 'Media Median', 'Media Variance', 'Media Peak Height', 'Media Peak Prominence', 
                 'Ratio Average', 'Ratio Median', 'Ratio Variance', 'Ratio Peak Height', 'Ratio Peak Prominence']

In [12]:
features.loc[:, feature_names]

Unnamed: 0,Media Area Ratio,Intima Area Ratio,Lumen Area Ratio,Hyalinosis Area Ratio,Aspect Ratio,Convexity,Intima Average,Intima Median,Intima Variance,Intima Peak Height,...,Media Average,Media Median,Media Variance,Media Peak Height,Media Peak Prominence,Ratio Average,Ratio Median,Ratio Variance,Ratio Peak Height,Ratio Peak Prominence
0,0.447457,0.126998,0.425545,0,1.470588,0.874192,0.073192,0.06893,0.000466,0.116673,...,0.224063,0.230494,0.001232,0.268764,0.10151,0.244531,0.233086,0.003766,0.397297,0.397297
1,0.761891,0.090428,0.147681,0,1.567623,0.705036,0.04415,0.047341,0.000287,0.075651,...,0.112401,0.097581,0.003124,0.372943,0.296969,0.290994,0.301833,0.009101,0.430258,0.351727
2,0.561021,0.252452,0.186527,0,0.612316,0.800356,0.062841,0.062405,0.000772,0.198445,...,0.080106,0.072665,0.001035,0.233525,0.174877,0.444989,0.442482,0.019992,0.715234,0.681892
3,0.646114,0.185384,0.168502,0,1.026316,0.94727,0.097852,0.079516,0.002494,0.194478,...,0.214187,0.192767,0.003271,0.385167,0.240482,0.303977,0.279259,0.011658,0.546768,0.317191
4,0.575528,0.225945,0.198527,0,0.638767,0.965174,0.108965,0.098914,0.001989,0.198643,...,0.173128,0.158489,0.002272,0.326555,0.21276,0.378544,0.380763,0.011444,0.584382,0.349198
5,0.519291,0.250709,0.23,0,0.353535,0.89567,0.094955,0.076817,0.002946,0.298103,...,0.138377,0.141377,0.001166,0.237526,0.139236,0.390762,0.379442,0.009464,0.643141,0.643141
6,0.613278,0.224312,0.16241,0,0.862162,0.961892,0.117778,0.114481,0.000998,0.22196,...,0.20463,0.21904,0.002381,0.288639,0.164009,0.367217,0.371035,0.00467,0.562526,0.277592
7,0.58314,0.242939,0.173922,0,1.225589,0.966396,0.114183,0.107513,0.00074,0.168567,...,0.194443,0.192215,0.003185,0.317925,0.222311,0.379299,0.383493,0.010482,0.577459,0.369183
8,0.59771,0.208017,0.194273,0,0.723906,0.977767,0.104337,0.096108,0.001515,0.204264,...,0.19707,0.194717,0.000908,0.282058,0.091831,0.337557,0.325492,0.006199,0.536237,0.221835
9,0.492298,0.174575,0.333126,0,0.693624,0.968247,0.052993,0.047668,0.000545,0.124361,...,0.126338,0.109012,0.005137,0.388675,0.341328,0.329012,0.325166,0.010205,0.515031,0.515031


In [11]:
pat_df = pd.read_csv(VESSEL_PAT_INFO_W_SCORE_PATH)
feature_path = FEATURES_PATH.replace(".xlsx", f"{suffix}.xlsx")

agg_feature_path = VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH.replace(".csv", f"{suffix}.csv")

available_sheetnames = pd.ExcelFile(feature_path, engine='openpyxl').sheet_names
logging.info(f"{len(pat_df)} slides selected, {len(pat_df) - len(available_sheetnames)} discarded, " 
            f"{len(available_sheetnames)} left for analysis.")

collected_features = []
for i, (index, row) in enumerate(pat_df.iterrows()):
    slide_filename = row["WSI_Selected"]
    logging.info(f"Processing: {i+1}/{len(pat_df)}: {slide_filename}")

    slide_basename = os.path.splitext(slide_filename)[0]
    features = get_classifications(feature_path, slide_basename, available_sheetnames, remove_others=False)
    if features.empty:
        continue  # Skip to if no relevant data

    print(features.loc[:, feature_names])
    break

#     for disease_type in DISEASE_TYPES:
#         features[f"{disease_type} Severity"] = features[f"{disease_type} Severity"].map(CLASSIFICATION_SEVERITY_MAPPING)

#     for artery_type in ARTERY_TYPES:
#         for feature_name in feature_names:
#             features_series = features[(features['Artery Type'] == artery_type)][feature_name]
#             if not features_series.empty:            
#                 max_severity = features_series.max()
#                 mean_severity = features_series.mean()
#                 median_severity = features_series.median()
#                 percentile_75th = features_series.quantile(0.75)
#                 percentile_25th = features_series.quantile(0.25)
#             else:
#                 max_severity = mean_severity = median_severity = percentile_75th = percentile_25th =  -1  # or another indicator for no data

#             # Storing these values in the DataFrame
#             pat_df.loc[index, f'Max_{feature_name}_in_{artery_type}'.replace(" ", "_")] = max_severity
#             pat_df.loc[index, f'Mean_{feature_name}_in_{artery_type}'.replace(" ", "_")] = mean_severity
#             pat_df.loc[index, f'Median_{feature_name}_in_{artery_type}'.replace(" ", "_")] = median_severity
#             pat_df.loc[index, f'75th_{feature_name}_in_{artery_type}'.replace(" ", "_")] = percentile_75th
#             pat_df.loc[index, f'25th_{feature_name}_in_{artery_type}'.replace(" ", "_")] = percentile_25th

# pat_df.to_csv(agg_feature_path, index=False)

#     collected_features.append(features)
# collected_features = pd.concat(collected_features, ignore_index=True)

2024-07-19 19:43:38,736 - INFO - 247 slides selected, 15 discarded, 232 left for analysis.
2024-07-19 19:43:38,739 - INFO - Processing: 1/247: 10_26609_000_002_L2_TRI.svs


    Media Area Ratio  Intima Area Ratio  Lumen Area Ratio  \
0           0.447457           0.126998          0.425545   
1           0.761891           0.090428          0.147681   
2           0.561021           0.252452          0.186527   
3           0.646114           0.185384          0.168502   
4           0.575528           0.225945          0.198527   
5           0.519291           0.250709          0.230000   
6           0.613278           0.224312          0.162410   
7           0.583140           0.242939          0.173922   
8           0.597710           0.208017          0.194273   
9           0.492298           0.174575          0.333126   
10          0.496922           0.291276          0.211803   
11          0.668229           0.153718          0.178054   

    Hyalinosis Area Ratio  Aspect Ratio  Convexity  Intima Average  \
0                       0      1.470588   0.874192        0.073192   
1                       0      1.567623   0.705036        0.044150

In [None]:
# Assuming collected_df is your DataFrame
collected_features_wo_measurements = collected_features[collected_features["Intima Average"].isna()]
images = []
titles = []
for index, row in collected_features_wo_measurements.iterrows():
    print(f"Image with NaN 'Intima Average': {row['Image Name']}")
    img_name = row["Image Name"]
    img_w_ann_path = os.path.join(CROPPED_VESSELS_DIR, row["Artery Type"], img_name.replace(".png", "_w_ann.png"))

    img_w_ann = cv2.cvtColor(cv2.imread(img_w_ann_path), cv2.COLOR_BGR2RGB)
    images.append(img_w_ann)
    titles.append("")

gallery_view(images, titles)

In [None]:
collected_features.columns

In [None]:
collected_features["Intima Area"] = collected_features["Intima Area"] / collected_features["Artery Area"]
collected_features["Media Area"] = collected_features["Media Area"] / collected_features["Artery Area"]
collected_features["Lumen Area"] = collected_features["Lumen Area"] / collected_features["Artery Area"]
collected_features["Hyalinosis Area"] = collected_features["Hyalinosis Area"] / collected_features["Artery Area"]
# Ensure there are no zero or negative values to avoid NaN results
collected_features['Log Artery Area'] = np.log(collected_features['Artery Area'])



In [None]:
selected_collected_features = collected_features[(~collected_features["Intima Average"].isna())
                                                 & (collected_features["Artery Type"] == "Arterioles") 
                                                #  & (collected_features["Arteriosclerosis Severity"] == 0)
                                                 & (collected_features["Lumen Area"] >= 0.05)
                                                 & (collected_features["Media Area"] >= 0.25)
                                                 & (collected_features["Hyalinosis Severity"] == 0)
                                                 & (collected_features["Hyalinosis Area"] == 0)
                                                 ]
selected_collected_features.head()

In [None]:
# Create a box plot
import scipy.stats as stats
plt.figure(figsize=(10, 6))
# Assuming selected_collected_features is already your DataFrame
intima_averages = selected_collected_features['Intima Average']
severity_levels = selected_collected_features['Arteriosclerosis Severity']

# Compute Pearson correlation
correlation, p_value = stats.pearsonr(intima_averages, severity_levels)

# Determine how to display the p-value in the title
if p_value < 0.05:
    p_value_text = "p-value < 0.05"
else:
    p_value_text = f"p-value = {p_value:.3f}"

# Create the scatter plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Arteriosclerosis Severity', y='Intima Average', data=selected_collected_features)

# Format the title to include correlation and p-value, adjusted for significance
title_text = f'Scatter Plot of Intima Average vs Arteriosclerosis Severity\n' \
             f'Pearson Correlation: {correlation:.3f} ({p_value_text})'
plt.title(title_text)
plt.xlabel('Arteriosclerosis Severity')
plt.ylabel('Intima Average')
plt.grid(True)
plt.show()

In [None]:
def rgb_to_mask_intra_arterial(rgb_image):
    # Define the mapping from RGB colors to class labels
    color_to_label = {
        (255, 0, 0): 1,   # Outer contour in red
        (0, 255, 0): 2,   # Middle contours in green
        (0, 0, 255): 3,   # Inner contours in blue
        (0, 0, 0): 0      # Background
    }
    label_mask = np.zeros((rgb_image.shape[0], rgb_image.shape[1]), dtype=np.uint8)
    for color, label in color_to_label.items():
        # Create a mask for each color matching
        matches = np.all(rgb_image == np.array(color, dtype=np.uint8), axis=-1)
        label_mask[matches] = label
    return label_mask

def calculate_mean(image, mask, class_label):
    # Extract pixels in the image corresponding to the class label in the mask
    class_pixels = image[mask == class_label]
    # Calculate and return mean
    mean_value = np.mean(class_pixels)
    return mean_value


# Filter data for 'Arteriosclerosis Severity' == 0
severity_0_data = selected_collected_features[selected_collected_features['Arteriosclerosis Severity'] == 0]

# Calculate Q1, Q3, and IQR
Q1 = severity_0_data['Intima Average'].quantile(0.25)
Q3 = severity_0_data['Intima Average'].quantile(0.75)
IQR = Q3 - Q1

# Define outliers
outliers = severity_0_data[(severity_0_data['Intima Average'] < (Q1 - 1.5 * IQR)) | 
                            (severity_0_data['Intima Average'] > (Q3 + 1.5 * IQR))]
images = []
titles = []
count = 0
for index, row in outliers.iterrows():
    count+=1
    if count > 10:break
    img_name = row["Image Name"]
    lumen_area = row["Lumen Area"]
    media_area = row["Media Area"]
    log_artery_area = row["Log Artery Area"]

    img_path = os.path.join(CROPPED_VESSELS_DIR, row["Artery Type"], img_name.replace(".png", "_ori.png"))
    img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    img_gray = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2GRAY)

    mask_wo_hya_path = os.path.join(CROPPED_VESSELS_DIR, row["Artery Type"], img_name.replace(".png", "_mask_wo_hya.png"))
    mask_wo_hya = cv2.cvtColor(cv2.imread(mask_wo_hya_path), cv2.COLOR_BGR2RGB)
    label_mask = rgb_to_mask_intra_arterial(mask_wo_hya)

    mean_lumen = calculate_mean(img, label_mask, 3)
    mean_intima = calculate_mean(img, label_mask, 2)
    mean_media = calculate_mean(img, label_mask, 1)

    var_intima_media = np.var([mean_intima, mean_media], axis=0)  # Calculate variance for each class across all images

    # Display images side by side
    plt.figure(figsize=(12, 6))  # Increase figure size for better visibility
    plt.subplot(1, 3, 1)  # 1 row, 2 columns, 1st subplot
    plt.imshow(img)
    # plt.title(f'Original Image - {img_name}')
    plt.axis('off')  # Hide axes to emphasize images

    plt.subplot(1, 3, 2)  # 1 row, 2 columns, 2nd subplot
    plt.imshow(img_gray)
    # plt.title('Mask without Hyalinosis')
    plt.axis('off')

    plt.subplot(1, 3, 3)  # 1 row, 2 columns, 2nd subplot
    plt.imshow(label_mask)
    # plt.title('Mask without Hyalinosis')
    plt.axis('off')

    plt.suptitle(f'Lumen Area: {lumen_area:.3f}, Media Area: {media_area:.3f}, Var: {var_intima_media:.3f}', fontsize=16)
    plt.show()
    # break

#     img_w_ann_path = os.path.join(CROPPED_VESSELS_DIR, row["Artery Type"], img_name.replace(".png", "_w_ann.png"))
#     img_w_ann = cv2.cvtColor(cv2.imread(img_w_ann_path), cv2.COLOR_BGR2RGB)
#     images.append(img_w_ann)
#     titles.append(f"L: {lumen_area:.2f}, M: {media_area:.2f}, LAA: {log_artery_area:.2f}")

# gallery_view(images, titles)

In [None]:
np.var([0, 2])

In [None]:
mask_wo_hya_path

In [None]:
img_path

In [None]:
severity_1_data = selected_collected_features[(selected_collected_features['Arteriosclerosis Severity'] > 0)]
images = []
titles = []
for index, row in severity_1_data.iterrows():
    print(f"Image with NaN 'Intima Average': {row['Image Name']}")
    img_name = row["Image Name"]
    img_w_ann_path = os.path.join(CROPPED_VESSELS_DIR, row["Artery Type"], img_name.replace(".png", "_w_ann.png"))
    img_w_ann = cv2.cvtColor(cv2.imread(img_w_ann_path), cv2.COLOR_BGR2RGB)
    images.append(img_w_ann)
    titles.append("")

gallery_view(images, titles)


In [None]:
# Scatter plot of 'Intima Average' vs 'Intima Peak Height'
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Lumen Area', y='Log Artery Area',
                data=outliers)
plt.title('Scatter Plot of Intima Average vs Intima Peak Height')
# plt.xlabel('Intima Average')
# plt.ylabel('Intima Peak Height')
plt.grid(True)
plt.show()


In [None]:
# Scatter plot of 'Intima Average' vs 'Intima Peak Height'
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Lumen Area', y='Log Artery Area', hue="Arteriosclerosis Severity",
                data=severity_1_data)
plt.title('Scatter Plot of Intima Average vs Intima Peak Height')
# plt.xlabel('Intima Average')
# plt.ylabel('Intima Peak Height')
plt.grid(True)
plt.show()
