In [1]:

# Standard library imports
import os
import sys
import logging

# Third-party imports
import pandas as pd
import numpy as np
import cv2

import seaborn as sns
import scipy

import scipy.stats as stats

from matplotlib import pyplot as plt

# Local module imports
sys.path.append(os.path.abspath('..'))
from utils.utils_data import get_veesel_sheets, get_measurements
from utils.utils_constants import (VESSEL_NEPTUNE_PAT_INFO_W_SCORE_PATH as  VESSEL_PAT_INFO_W_SCORE_PATH,
                                   VESSEL_NEPTUNE_PAT_INFO_W_SCORE_W_FEATURE_PATH as  VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH,
                                   DISEASE_TYPES, ARTERY_TYPES, CLASSIFICATION_SEVERITY_MAPPING,
                                   MEASUREMENTS_DIR, FEATURES_PATH, CROPPED_VESSELS_DIR)

from utils.utils_vis import gallery_view

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
# Area and area-ratio feature columns.
base_feature_names = [
    'Log Artery Area',
    'Media Area Ratio',
    'Intima Area Ratio',
    'Lumen Area Ratio',
]

# One column per (measurement source, statistic) pair, ordered source-major:
# all Intima statistics first, then Media, then Ratio.
measurement_feature_names = [
    f'{source} {statistic}'
    for source in ('Intima', 'Media', 'Ratio')
    for statistic in ('Average', 'Median', 'Peak Height', 'Peak Prominence')
]

# Reduced alternative (Average and Peak Height only), currently disabled:
# measurement_feature_names = [
#                  'Intima Average',  'Intima Peak Height',
#                  'Media Average', 'Media Peak Height',
#                  'Ratio Average',   'Ratio Peak Height']

# Hyalinosis feature column; not part of `all_features`.
hya_feature_names = ['Hyalinosis Area Ratio']

# Shape features, currently disabled:
# sub_feature_names = ['Artery Aspect Ratio',
#        'Artery Convexity', 'Intima Aspect Ratio', 'Intima Convexity']

# Candidate feature columns used by the analysis cells.
all_features = base_feature_names + measurement_feature_names

In [3]:
# Image names that the analysis cells drop from `collected_features`
# via `~collected_features["Image Name"].isin(discard)`.
discard = [
    "Biopsy_039_WSI_001_A12_34475_15494_1002_852.png",
    "Biopsy_167_WSI_001_A12_123358_53578_920_1202.png",
    "Biopsy_221_WSI_001_A03_13753_61887_780_908.png",
    # Further candidates, currently NOT discarded
    # (Biopsy_031 ... A18 appears twice in this commented list):
    # "Biopsy_037_WSI_001_A10_13653_14790_603_663.png",
    # "Biopsy_043_WSI_001_A14_66774_23206_1084_775.png",
    # "Biopsy_043_WSI_001_A20_75208_22853_1291_751.png",
    # "Biopsy_043_WSI_001_A26_73795_12675_152_444.png",
    # "Biopsy_045_WSI_001_A04_86291_13710_570_623.png",
    # "Biopsy_047_WSI_001_A03_4680_39745_401_375.png",
    # "Biopsy_047_WSI_001_A16_17927_52099_452_486.png",
    # "Biopsy_048_WSI_001_A03_4698_9638_440_760.png",
    # "Biopsy_049_WSI_001_A23_92729_14041_267_386.png",
    # "Biopsy_098_WSI_001_A01_70449_29042_232_299.png",
    # "Biopsy_099_WSI_001_A09_122614_17008_270_618.png",
    # "Biopsy_100_WSI_001_A14_118686_20132_347_318.png",
    # "Biopsy_109_WSI_001_A08_106130_20638_665_590.png",
    # 'Biopsy_031_WSI_001_A18_54047_35938_422_504.png',
    # 'Biopsy_160_WSI_001_A06_14421_16216_321_357.png',
    # 'Biopsy_174_WSI_001_A02_22631_27305_396_651.png',
    # 'Biopsy_203_WSI_001_A01_108871_42238_437_488.png',
    # 'Biopsy_248_WSI_001_A04_31835_6034_292_335.png',
    # 'Biopsy_279_WSI_001_A08_20755_31426_377_333.png',
    # 'Biopsy_280_WSI_001_A17_15771_11902_329_344.png',
    # 'Biopsy_031_WSI_001_A18_54047_35938_422_504.png'
]

In [15]:
def violin_plots(df, feature_name, severity_type, ax, artery_type):
    """Correlate one feature with a severity score and optionally draw a violin plot.

    The Pearson correlation between ``df[feature_name]`` and
    ``df[f"{severity_type} Severity"]`` is printed (same format as before) and
    returned. Rows with a missing value in either column are ignored, so a
    single NaN no longer turns the whole correlation into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``feature_name`` and
        ``f"{severity_type} Severity"``.
    feature_name : str
        Name of the feature column (plotted on the y axis).
    severity_type : str
        Prefix of the severity column, e.g. ``"Arteriosclerosis"``.
    ax : matplotlib.axes.Axes or None
        Axes to draw the violin plot on. Pass ``None`` to only compute and
        print the correlation without drawing anything.
    artery_type : str
        Not used by the function; kept so existing call sites keep working.

    Returns
    -------
    tuple of (float, float)
        Pearson correlation coefficient and the p-value reported by
        ``scipy.stats.pearsonr``. Both are NaN when fewer than two complete
        rows exist or when either column is constant.
    """
    severity_column = f'{severity_type} Severity'

    # Pairwise-complete observations only.
    paired = df.loc[:, [feature_name, severity_column]].dropna()
    features = paired[feature_name]
    scores = paired[severity_column]

    # The correlation is undefined for <2 points or a constant column.
    if len(paired) < 2 or features.nunique() < 2 or scores.nunique() < 2:
        rho, p_val = float('nan'), float('nan')
    else:
        rho, p_val = scipy.stats.pearsonr(features, scores)
        rho, p_val = float(rho), float(p_val)

    p_str = " p<0.0001" if p_val < 0.0001 else f" p={p_val:.4f}"
    print(feature_name, rho, p_str)

    if ax is not None:
        # The drawing code used to be commented out, which left every figure
        # created by the callers empty.
        sns.violinplot(x=severity_column, y=feature_name, data=paired, ax=ax)
        ax.tick_params(axis='both', labelsize=15)
        ax.set_xlabel(severity_column, fontsize=20)
        ax.set_ylabel("Feature Value", fontsize=20)
        # Title is placed below the axes (negative y / pad).
        ax.set_title(feature_name + ": " + r"$\rho$" + "={:.2f}".format(rho) + "," + p_str,
                     y=-0.28, pad=-14, fontsize=20)

    return rho, p_val
def gallery_view_rows(rows):
    """Display the ``_w_ann.png`` image of every row and return all image names.

    For each row the file
    ``CROPPED_VESSELS_DIR/<Artery Type>/<Image Name with .png -> _w_ann.png>``
    is read with OpenCV. Images that load are converted from BGR to RGB and
    passed to ``gallery_view`` together with a title built from the biopsy
    number, artery ID and both severity scores. Rows whose image cannot be
    read are left out of the gallery but still appear in the returned list.

    Parameters
    ----------
    rows : pandas.DataFrame
        Needs the columns "Artery Type", "Image Name", "Artery ID",
        "Arteriosclerosis Severity" and "Hyalinosis Severity".

    Returns
    -------
    list
        The "Image Name" value of every row, in iteration order.
    """
    loaded_images = []
    captions = []
    all_names = []

    for _, record in rows.iterrows():
        vessel_dir = record["Artery Type"]
        name = record["Image Name"]
        annotated_path = os.path.join(
            CROPPED_VESSELS_DIR, vessel_dir, name.replace(".png", "_w_ann.png")
        )
        bgr = cv2.imread(annotated_path, cv2.IMREAD_COLOR)

        # The name is recorded whether or not the image could be read.
        all_names.append(name)
        if bgr is None:
            continue

        # OpenCV loads BGR; convert to RGB for display.
        loaded_images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        captions.append(
            f"B{name.split('_')[1]}{record['Artery ID']}-AS: {record['Arteriosclerosis Severity']}; HS: {record['Hyalinosis Severity']}"
        )

    # Third argument is forwarded as-is (presumably the number of gallery columns — confirm in utils_vis).
    gallery_view(loaded_images, captions, 6)
    return all_names




In [16]:
# Read the feature CSV that sits next to FEATURES_PATH
# (".xlsx" replaced by "<suffix>.csv") and display it.
suffix = "_measurements_exclude_hya_manual"
features_csv_path = FEATURES_PATH.replace(".xlsx", f"{suffix}.csv")
collected_features = pd.read_csv(features_csv_path)
collected_features

Unnamed: 0,Slide Name,Image Name,Artery ID,Bounding Box,Artery Type,Arteriosclerosis Severity,Hyalinosis Severity,Artery Area,Log Artery Area,Media Area Ratio,...,Media Variance,Vis Media Peak Indice,Media Peak Height,Media Peak Prominence,Ratio Average,Ratio Median,Ratio Variance,Vis Ratio Peak Indice,Ratio Peak Height,Ratio Peak Prominence
0,10_26609_000_002_L2_TRI,Biopsy_001_WSI_001_A01_87467_29252_539_387.png,A01,"87467, 29252, 539, 387",Interlobular Arteries,0,0,61749.0,11.030833,0.447457,...,0.001232,306.0,0.268764,0.101510,0.244531,0.233086,0.003766,232.0,0.397297,0.397297
1,10_26609_000_002_L2_TRI,Biopsy_001_WSI_001_A02_86515_28505_861_584.png,A02,"86515, 28505, 861, 584",Interlobular Arteries,0,0,100195.5,11.514879,0.761891,...,0.003124,226.0,0.372943,0.296969,0.290994,0.301833,0.009101,302.0,0.430258,0.351727
2,10_26609_000_002_L2_TRI,Biopsy_001_WSI_001_A04_83812_19882_846_1293.png,A04,"83812, 19882, 846, 1293",Interlobular Arteries,0,0,282556.5,12.551634,0.561021,...,0.000590,294.0,0.165494,0.092851,0.388217,0.366463,0.008262,282.0,0.617519,0.479648
3,10_26609_000_002_L2_TRI,Biopsy_001_WSI_001_A05_85633_16311_186_182.png,A05,"85633, 16311, 186, 182",Arterioles,0,0,15934.5,9.676242,0.646114,...,0.003271,127.0,0.385167,0.240482,0.303977,0.279259,0.011658,249.0,0.546768,0.317191
4,10_26609_000_002_L2_TRI,Biopsy_001_WSI_001_A06_83026_12349_173_255.png,A06,"83026, 12349, 173, 255",Arterioles,0,0,21118.0,9.957881,0.575528,...,0.002272,111.0,0.326555,0.212760,0.378544,0.380763,0.011444,77.0,0.584382,0.349198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,12_26609_099_008 L10 TRI,Biopsy_294_WSI_001_A05_119315_43916_308_667.png,A05,"119315, 43916, 308, 667",Interlobular Arteries,0,0,100982.5,11.522703,0.688416,...,0.008817,79.0,0.498193,0.148424,0.248987,0.248495,0.001706,347.0,0.321403,0.222887
2940,12_26609_099_008 L10 TRI,Biopsy_294_WSI_001_A04_119235_44642_393_862.png,A04,"119235, 44642, 393, 862",Others,0,0,105165.5,11.563291,0.662023,...,0.003442,76.0,0.317187,0.246876,0.295191,0.279455,0.005317,114.0,0.428860,0.396667
2941,12_26609_099_008 L10 TRI,Biopsy_294_WSI_001_A03_119545_45420_198_422.png,A03,"119545, 45420, 198, 422",Others,0,0,44207.5,10.696650,0.747328,...,0.012488,266.0,0.679842,0.544910,0.229790,0.229589,0.001931,290.0,0.362872,0.224653
2942,12_26609_099_008 L10 TRI,Biopsy_294_WSI_001_A02_121150_45544_268_385.png,A02,"121150, 45544, 268, 385",Others,0,0,38816.0,10.566588,0.722060,...,0.007725,118.0,0.387187,0.132096,0.257314,0.268213,0.008253,266.0,0.491288,0.491288


In [17]:
# Arteriosclerosis feature screening, per artery type plus the pooled "All" group:
#   1. fit a shallow decision tree on the severity score and keep the features
#      whose importance exceeds IMPORTANCE_THRESHOLD,
#   2. print each candidate feature's Pearson correlation with the score.
# The kept features are stored in `selected_features_by_type` for the
# per-slide aggregation cell further down.
from sklearn.tree import DecisionTreeClassifier  # imported once, not on every loop iteration

IMPORTANCE_THRESHOLD = 0.1  # minimum tree feature importance for a feature to be kept
TREE_MAX_DEPTH = 5

suffix = "_measurements_exclude_hya_manual"
collected_features = pd.read_csv(FEATURES_PATH.replace(".xlsx", f"{suffix}.csv"))

# Optional row filters that were tried and are currently disabled:
# collected_features = collected_features[collected_features["Lumen Area Ratio"] > 0.02]
# collected_features = collected_features[collected_features["Artery Area"] > 20000]
collected_features = collected_features[~collected_features["Image Name"].isin(discard)]

selected_features = all_features
disease_type = "Arteriosclerosis"
selected_features_by_type = {}

for artery_type in ARTERY_TYPES + ["All"]:
    print(artery_type)
    if artery_type == "All":
        # Pseudo-group: every vessel regardless of its artery type.
        collected_features_selected = collected_features
    else:
        collected_features_selected = collected_features.loc[
            collected_features["Artery Type"] == artery_type
        ]
    collected_features_selected = collected_features_selected.dropna(subset=selected_features)
    print(collected_features_selected.shape)

    X = collected_features_selected.loc[:, selected_features].values
    y = collected_features_selected.loc[:, f"{disease_type} Severity"].values

    clf = DecisionTreeClassifier(random_state=0, max_depth=TREE_MAX_DEPTH)
    clf.fit(X, y)

    selected_features_by_type[artery_type] = {
        name
        for name, importance in zip(selected_features, clf.feature_importances_)
        if importance > IMPORTANCE_THRESHOLD
    }

    # Correlations only: no figure is created in this cell, so no axes are
    # passed (the previous `axs` argument was undefined on a fresh kernel).
    for feature_name in selected_features:
        violin_plots(collected_features_selected, feature_name, disease_type, None, artery_type)

# The hyalinosis feature is always aggregated for arterioles.
selected_features_by_type.setdefault("Arterioles", set()).update(hya_feature_names)
# selected_features_by_type["Interlobular Arteries"].add("Hyalinosis Area Ratio")

Arterioles
(1491, 35)
Log Artery Area 0.08591590638013805  p=0.0009
Media Area Ratio -0.15189809446680166  p<0.0001
Intima Area Ratio 0.2681961122107826  p<0.0001
Lumen Area Ratio -0.09062463607818198  p=0.0005
Intima Average 0.26418831750689264  p<0.0001
Intima Median 0.26428194066154315  p<0.0001
Intima Peak Height 0.15759781073853105  p<0.0001
Intima Peak Prominence 0.03965217866221274  p=0.1259
Media Average -0.07276107479850563  p=0.0049
Media Median -0.06666744782766354  p=0.0100
Media Peak Height -0.06574591796283277  p=0.0111
Media Peak Prominence -0.050111922526587534  p=0.0530
Ratio Average 0.256046533419257  p<0.0001
Ratio Median 0.24336407403237326  p<0.0001
Ratio Peak Height 0.16626195429250118  p<0.0001
Ratio Peak Prominence 0.05905986834101838  p=0.0226
Interlobular Arteries
(693, 35)
Log Artery Area 0.2551272008760215  p<0.0001
Media Area Ratio -0.2938112284492477  p<0.0001
Intima Area Ratio 0.6877891155020736  p<0.0001
Lumen Area Ratio -0.26213869645041726  p<0.0001
In

In [None]:
# Number of entries in the most recently assigned `selected_features` list.
len(selected_features)

In [None]:
# Hyalinosis: for every artery type, correlate "Hyalinosis Area Ratio" with the
# Hyalinosis severity score and save one figure per (artery type, feature)
# into the "violin_plots_all" directory.
suffix = "_measurements_exclude_hya_manual"
collected_features = pd.read_csv(FEATURES_PATH.replace(".xlsx", f"{suffix}.csv"))
collected_features = collected_features[~collected_features["Image Name"].isin(discard)]

# NOTE: `selected_features_by_type` is deliberately NOT reset here. The
# aggregation cell below indexes it by artery type, and clearing it in this
# cell made that lookup fail with a KeyError when the notebook is run top to bottom.
selected_features = ["Hyalinosis Area Ratio"]
disease_type = "Hyalinosis"

output_dir = "violin_plots_all"
os.makedirs(output_dir, exist_ok=True)  # savefig does not create missing directories

# The pooled "All" group was skipped via `continue` before; iterate the real types only.
for artery_type in ARTERY_TYPES:
    print(artery_type)
    collected_features_selected = collected_features.loc[
        collected_features["Artery Type"] == artery_type
    ].dropna(subset=selected_features)
    print(collected_features_selected.shape)

    for feature_name in selected_features:
        fig, axs = plt.subplots(1, 1, figsize=(8, 4))
        violin_plots(collected_features_selected, feature_name, disease_type, axs, artery_type)

        fig.tight_layout()
        save_name = f"{artery_type}_{feature_name}_{disease_type}.png".replace(" ", "_")
        # Save before showing so the file never depends on backend behaviour after show().
        fig.savefig(os.path.join(output_dir, save_name))
        plt.show()
        plt.close(fig)  # release the figure; one is created per iteration



In [54]:
# # collected_features = pd.read_csv(FEATURES_PATH.replace(".xlsx", f"{suffix}.csv"))

# for artery_type in ARTERY_TYPES:
#     disease_type = "Arteriosclerosis"
#     feature_names = ['Intima Average', 'Intima Peak Height', 'Media Average', 'Media Peak Height', 
#                     'Ratio Average', 'Ratio Peak Height']
#     collected_features_selected = collected_features.loc[(collected_features["Artery Type"] == artery_type)
#                                                         #  &(collected_features["Hyalinosis Severity"] == 0)
#                                                         ]
#     collected_features_selected = collected_features_selected.dropna(subset=feature_names)
#     image_names = gallery_view_rows(collected_features_selected[(collected_features_selected["Arteriosclerosis Severity"]==0) & 
#                                                   (collected_features_selected["Intima Average"] > .2)])
#     print(image_names)

In [None]:
# # 
# # artery_type = "Arterioles"
# collected_features["Artery Area"] = np.log(collected_features["Artery Area"])
# for artery_type in ARTERY_TYPES:
#     print(artery_type)
#     disease_type = "Arteriosclerosis"
#     feature_names = ["Artery Area", "Lumen Area Ratio", "Intima Area Ratio", "Media Area Ratio"]
#     collected_features_selected = collected_features.loc[(collected_features["Artery Type"] == artery_type)
#                                                         #  &(collected_features["Hyalinosis Severity"] == 0)
#                                                         ]
#     collected_features_selected = collected_features_selected.dropna(subset=feature_names)
#     # collected_features_selected = collected_features_selected[~collected_features_selected["Image Name"].isin(discard)]
#     # print(collected_features_selected.shape)

#     fig, axs = plt.subplots(1, 4, figsize=(20, 5))
#     for i, feature_name in enumerate(["Artery Area", "Lumen Area Ratio", "Intima Area Ratio", "Media Area Ratio"]):
#         violin_plots(collected_features_selected, feature_name, disease_type, axs[i], artery_type)  
#     plt.tight_layout()
#     plt.show()

In [None]:
# # Visualize the distribution of 'Lumen Area Ratio' for arterioles (the title and x-label below still say 'Artery Convexity')
# plt.figure(figsize=(10, 6))
# plt.hist(collected_features[collected_features["Artery Type"]=="Arterioles"]['Lumen Area Ratio'].dropna(), bins=100, color='blue', edgecolor='black')
# plt.title('Distribution of Artery Convexity')
# plt.xlabel('Artery Convexity')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.show()

# gallery_view_rows(collected_features[(collected_features["Artery Type"]=="Arterioles") & (collected_features["Lumen Area Ratio"]<0.05)])

In [None]:
# collected_features.head()

In [None]:

# artery_type = "Arcuate Arteries"
# disease_type = "Arteriosclerosis"
# feature_names = ['Lumen Area Ratio', 'Intima Peak Height', 'Ratio Peak Height']
# collected_features_selected = collected_features.loc[(collected_features["Artery Type"] == artery_type)]
# collected_features_selected = collected_features_selected.dropna(subset=feature_names)

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.linear_model import LogisticRegression
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Assuming collected_features_selected is ready and contains the correct columns
# X = collected_features_selected[feature_names]
# y = collected_features_selected[disease_type + " Severity"]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build a Random Forest model
# model = RandomForestRegressor(random_state=42)
# model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Visualize feature importance
# # feature_importances = pd.Series(model.feature_importances_, index=feature_names)
# # sns.barplot(x=feature_importances, y=feature_importances.index)
# # plt.title('Feature Importance')
# # plt.show()

# # Plot predicted vs actual
# plt.figure(figsize=(10, 6))
# plt.scatter(y_test, y_pred, alpha=0.3)
# plt.xlabel('Actual Labels')
# plt.ylabel('Predicted Labels')
# plt.title('Predicted vs Actual Severity')
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
# plt.show()

# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")


In [None]:
# rows = collected_features[(collected_features["Arteriosclerosis Severity"] == 0)
#                    & (collected_features["Ratio Average"] > 0.4)
#                 #    & (collected_features["Intima Variance"] > 0.002)
#                    & (collected_features["Artery Type"] == "Interlobular Arteries")]



In [None]:
# # 
# # artery_type = "Arterioles"
# collected_features = collected_features[collected_features["Artery Area"]>=10000]

# for artery_type in ARTERY_TYPES:
#     print(artery_type)
#     disease_type = "Arteriosclerosis"
#     feature_names = ['Intima Average', 'Intima Peak Height', 'Media Average', 'Media Peak Height', 
#                     'Ratio Average', 'Ratio Peak Height']
#     collected_features_selected = collected_features.loc[(collected_features["Artery Type"] == artery_type)
#                                                         #  &(collected_features["Hyalinosis Severity"] == 0)
#                                                         ]
#     print(collected_features_selected.shape)
#     collected_features_selected = collected_features_selected.dropna(subset=feature_names)
#     print(collected_features_selected.shape)
#     # collected_features_selected = collected_features_selected[~collected_features_selected["Image Name"].isin(discard)]
#     # print(collected_features_selected.shape)

#     fig, axs = plt.subplots(1, 6, figsize=(25, 5))
#     for i, measure_src in enumerate(["Intima", "Media", "Ratio"]):
#         for j, stat_f in enumerate(["Average", "Peak Height"]):
#             feature_name = f"{measure_src} {stat_f}"
#             violin_plots(collected_features_selected, feature_name, disease_type, axs[i*2+j], artery_type)  
#     plt.tight_layout()
#     plt.show()

In [None]:
# fig, axs = plt.subplots(1, 3, figsize=(5*3, 5))  
# for i, artery_type in enumerate(ARTERY_TYPES):
#     violin_plots(collected_features.loc[collected_features["Artery Type"]==artery_type, :], 
#                  collected_features.loc[collected_features["Hyalinosis Severity"]==0, :],
#                  "Hyalinosis Area Ratio", "Hyalinosis", axs[i], artery_type)  
# plt.tight_layout()
# plt.show()

In [None]:
# # Assuming you have already defined 'collected_features'
# # Specify the columns to check for NaNs
# columns_to_check = [
#     'Intima Average', 'Intima Median', 'Intima Variance',
#     'Vis Intima Peak Indice', 'Intima Peak Height',
#     'Intima Peak Prominence', 'Media Average', 'Media Median',
#     'Media Variance', 'Vis Media Peak Indice', 'Media Peak Height',
#     'Media Peak Prominence', 'Ratio Average', 'Ratio Median',
#     'Ratio Variance', 'Vis Ratio Peak Indice', 'Ratio Peak Height',
#     'Ratio Peak Prominence'
# ]

# # This will select the rows from collected_features where all columns in columns_to_check are NaN
# # rows_all_nan = collected_features[collected_features[columns_to_check].isna().all(axis=1)]

# # Create a boolean mask where at least one column is NaN
# at_least_one_nan = collected_features[columns_to_check].isna().any(axis=1)

# # Create a boolean mask where not all columns are NaN
# not_all_nan = ~collected_features[columns_to_check].isna().all(axis=1)

# # Combine both masks to filter rows
# rows_with_some_nans = collected_features[at_least_one_nan & not_all_nan]


# images, titles = [], []
# # Ensure combined_classifications DataFrame is defined correctly with the right columns
# for index, row in rows_with_some_nans.iterrows():
#     artery_type = row["Artery Type"]
#     # Construct the path to the image file
#     image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
#                                 row["Image Name"].replace(".png", "_w_ann.png"))
#     img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
#     if img is not None:
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
#         images.append(img)
#         # Create a title using multiple fields from the DataFrame
#         title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
#         titles.append(title)

# # collected_features = collected_features.dropna()
# gallery_view(images, titles, 4)

In [None]:
# rows_with_nan = collected_features[collected_features.isna().any(axis=1)]
# images, titles = [], []
# # Ensure combined_classifications DataFrame is defined correctly with the right columns
# for index, row in rows_with_nan.iterrows():
#     artery_type = row["Artery Type"]
#     # Construct the path to the image file
#     image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
#                                 row["Image Name"].replace(".png", "_w_ann.png"))
#     img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
#     if img is not None:
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
#         images.append(img)
#         # Create a title using multiple fields from the DataFrame
#         title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
#         titles.append(title)

# collected_features = collected_features.dropna()
# collected_features.head()

In [None]:
# gallery_view(images, titles)

In [None]:

# feature_names = ['Intima Average', 'Intima Peak Height', 'Ratio Average', 'Ratio Peak Height']
# for artery_type in ARTERY_TYPES:
#     for disease_type in DISEASE_TYPES:
#         if disease_type != "Arteriosclerosis": continue
#         fig, axs = plt.subplots(1, len(feature_names), figsize=(5*len(feature_names), 5))  
#         for i, feature_name in enumerate(feature_names):
#             violin_plots(collected_features.loc[collected_features["Artery Type"]==artery_type, :], 
#                          feature_name, disease_type, axs[i])  
#         plt.tight_layout()
#         plt.show()

In [None]:
# Aggregate the per-vessel features to one row per slide: for every artery
# group and every feature selected for that group, store the max / mean /
# median / 75th / 25th percentile over the slide's vessels, then write the
# enriched patient table to `agg_feature_path`.
pat_df = pd.read_csv(VESSEL_PAT_INFO_W_SCORE_PATH)
agg_feature_path = VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH.replace(".csv", f"{suffix}.csv")

artery_groups = ARTERY_TYPES + ["All"]

# Fail early with a readable message instead of a bare KeyError mid-loop.
missing_groups = [group for group in artery_groups if group not in selected_features_by_type]
if missing_groups:
    raise KeyError(
        f"selected_features_by_type has no entry for {missing_groups}; "
        "re-run the feature-selection cell before aggregating."
    )

# Count slides against the table that is actually used below
# (`collected_features`), not against the sheets of the Excel export.
slide_basenames = pat_df["WSI_Selected"].map(lambda name: os.path.splitext(name)[0])
n_with_features = int(slide_basenames.isin(collected_features["Slide Name"]).sum())
logging.info(f"{len(pat_df)} slides selected, {len(pat_df) - n_with_features} discarded, "
            f"{n_with_features} left for analysis.")

# (column prefix, reducer) in the column order used by the original cell.
stat_functions = (
    ("Max", lambda s: s.max()),
    ("Mean", lambda s: s.mean()),
    ("Median", lambda s: s.median()),
    ("75th", lambda s: s.quantile(0.75)),
    ("25th", lambda s: s.quantile(0.25)),
)

stats_rows = []     # one dict of aggregated values per slide that has features
stats_index = []    # matching pat_df index labels

for i, (index, slide_filename) in enumerate(pat_df["WSI_Selected"].items()):
    logging.info(f"Processing: {i+1}/{len(pat_df)}: {slide_filename}")
    slide_basename = os.path.splitext(slide_filename)[0]

    features = collected_features[collected_features["Slide Name"] == slide_basename]
    if features.empty:
        continue  # no vessels for this slide; its aggregate columns stay empty

    slide_stats = {}
    for artery_type in artery_groups:
        # "All" is a pseudo-group: it is not a value of the "Artery Type"
        # column, so filtering on it would always select nothing.
        if artery_type == "All":
            group = features
        else:
            group = features[features["Artery Type"] == artery_type]

        # sorted(): sets iterate in arbitrary order; keep the column order stable.
        for feature_name in sorted(selected_features_by_type[artery_type]):
            features_series = group[feature_name]
            for prefix, reducer in stat_functions:
                column = f"{prefix}_{feature_name}_in_{artery_type}".replace(" ", "_")
                slide_stats[column] = reducer(features_series) if not features_series.empty else np.nan

    stats_rows.append(slide_stats)
    stats_index.append(index)

# Attach all new columns at once instead of creating them cell by cell.
if stats_rows:
    stats_df = pd.DataFrame(stats_rows, index=stats_index).reindex(pat_df.index)
    pat_df = pd.concat([pat_df, stats_df], axis=1)

pat_df.to_csv(agg_feature_path, index=False)

In [None]:
# Display the path stored in `agg_feature_path`.
agg_feature_path

In [None]:
# Print every column of `pat_df` whose name contains "Arterioles".
arteriole_columns = [column for column in pat_df.columns if "Arterioles" in column]
for column in arteriole_columns:
    print(column)