In [None]:

# Standard library imports
import os
import sys
import logging

# Third-party imports
import pandas as pd
import numpy as np
import cv2

import seaborn as sns
import scipy

import scipy.stats as stats

from matplotlib import pyplot as plt

# Local module imports
sys.path.append(os.path.abspath('..'))
from utils.utils_data import get_veesel_sheets, get_measurements
from utils.utils_constants import (VESSEL_NEPTUNE_PAT_INFO_W_SCORE_PATH as  VESSEL_PAT_INFO_W_SCORE_PATH,
                                   VESSEL_NEPTUNE_PAT_INFO_W_SCORE_W_FEATURE_PATH as  VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH,
                                   DISEASE_TYPES, ARTERY_TYPES, CLASSIFICATION_SEVERITY_MAPPING,
                                   MEASUREMENTS_DIR, FEATURES_PATH, CROPPED_VESSELS_DIR)

from utils.utils_vis import gallery_view

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# Feature column names, grouped by family. The first two lists are the columns
# that get summarised per slide in the aggregation cell.
base_feature_names = ['Media Area Ratio', 'Intima Area Ratio', 'Lumen Area Ratio']

# Every "<source> <statistic>" pair, ordered source-major:
# all Intima statistics first, then Media, then Ratio.
measurement_feature_names = [
    f"{source} {statistic}"
    for source in ('Intima', 'Media', 'Ratio')
    for statistic in ('Average', 'Median', 'Variance', 'Peak Height', 'Peak Prominence')
]

hya_feature_names = ['Hyalinosis Area Ratio']

In [None]:
def violin_plots(df, feature_name, severity_type, ax, artery_type):
    """Draw a violin plot of one feature per severity grade and annotate it.

    The plot is drawn on ``ax`` with the severity column on the x axis and the
    feature on the y axis. The title placed under the axes shows the artery
    type together with the Pearson correlation between feature and severity
    and its p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature_name`` and ``f"{severity_type} Severity"``.
    feature_name : str
        Column plotted on the y axis.
    severity_type : str
        Prefix of the severity column, e.g. ``"Arteriosclerosis"``.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    artery_type : str
        Text shown on the first line of the title.

    Notes
    -----
    The correlation is computed on rows where both values are present. With
    fewer than two such rows it is reported as ``nan`` instead of raising.
    """
    severity_col = f'{severity_type} Severity'

    # Correlate complete pairs only, so one missing value cannot turn the
    # whole coefficient into NaN.
    paired = df.loc[:, [feature_name, severity_col]].dropna()
    if len(paired) >= 2:
        rho, p_val = scipy.stats.pearsonr(paired[feature_name], paired[severity_col])
    else:
        # pearsonr needs at least two observations; keep the figure alive.
        rho = p_val = float("nan")
    p_str = "p<0.001" if p_val < 0.001 else f"p={p_val:.3f}"

    # The violin plot itself still receives the full frame.
    sns.violinplot(x=severity_col, y=feature_name, data=df, ax=ax)
    ax.set_ylabel(f"{feature_name}", fontsize=20)
    # NOTE(review): the label reads gamma_rho although the statistic is
    # Pearson's r - confirm the intended symbol.
    # y=-0.4 / pad=-14 place the title underneath the axes.
    ax.set_title(f"{artery_type}\n$\\gamma_{{\\rho}}$={rho:.2f} {p_str}", y=-0.4, pad=-14, fontsize=22)


In [None]:
# Suffix shared by the feature/measurement files read in this notebook.
suffix = "_measurements_exclude_hya_manual"

# The per-vessel feature table sits next to the workbook, as a CSV export.
features_csv_path = FEATURES_PATH.replace(".xlsx", f"{suffix}.csv")
collected_features = pd.read_csv(features_csv_path)
collected_features.shape

In [None]:

artery_type = "Arcuate Arteries"
disease_type = "Arteriosclerosis"
feature_names = ['Lumen Area Ratio', 'Intima Peak Height', 'Ratio Peak Height']

# Keep vessels of the chosen artery type that have every selected feature.
is_selected_artery = collected_features["Artery Type"] == artery_type
collected_features_selected = (
    collected_features
    .loc[is_selected_artery]
    .dropna(subset=feature_names)
)

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.linear_model import LogisticRegression
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Assuming collected_features_selected is ready and contains the correct columns
# X = collected_features_selected[feature_names]
# y = collected_features_selected[disease_type + " Severity"]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build a Random Forest model
# model = RandomForestRegressor(random_state=42)
# model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Visualize feature importance
# # feature_importances = pd.Series(model.feature_importances_, index=feature_names)
# # sns.barplot(x=feature_importances, y=feature_importances.index)
# # plt.title('Feature Importance')
# # plt.show()

# # Plot predicted vs actual
# plt.figure(figsize=(10, 6))
# plt.scatter(y_test, y_pred, alpha=0.3)
# plt.xlabel('Actual Labels')
# plt.ylabel('Predicted Labels')
# plt.title('Predicted vs Actual Severity')
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
# plt.show()

# print(f"Mean Squared Error: {mse}")
# print(f"R-squared: {r2}")


In [None]:
# rows = collected_features[(collected_features["Arteriosclerosis Severity"] == 0)
#                    & (collected_features["Ratio Average"] > 0.4)
#                 #    & (collected_features["Intima Variance"] > 0.002)
#                    & (collected_features["Artery Type"] == "Interlobular Arteries")]

# images, titles = [], []
# # Ensure combined_classifications DataFrame is defined correctly with the right columns
# for index, row in rows.iterrows():
#     artery_type = row["Artery Type"]
#     # Construct the path to the image file
#     image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
#                                 row["Image Name"].replace(".png", "_w_ann.png"))
#     img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
#     if img is not None:
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
#         images.append(img)
#         # Create a title using multiple fields from the DataFrame
#         title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
#         # Peak: {row['Intima Peak Height']}"
#         titles.append(title)

# # collected_features = collected_features.dropna()
# gallery_view(images, titles, 4)

In [None]:
# Image file names earmarked for exclusion.
# NOTE(review): no active code visible in this notebook applies this list - confirm intended use.
discard = [
    "Biopsy_039_WSI_001_A12_34475_15494_1002_852.png",
    "Biopsy_167_WSI_001_A12_123358_53578_920_1202.png",
    "Biopsy_221_WSI_001_A03_13753_61887_780_908.png",
]

In [None]:
# 
# artery_type = "Arterioles"
for artery_type in ARTERY_TYPES:
    print(artery_type)
    disease_type = "Arteriosclerosis"
    # One panel per feature, left to right: Intima, Media, Ratio, each with
    # its Average followed by its Peak Height.
    feature_names = ['Intima Average', 'Intima Peak Height', 'Media Average', 'Media Peak Height',
                     'Ratio Average', 'Ratio Peak Height']

    # Optional extra filter, currently disabled:
    #   & (collected_features["Hyalinosis Severity"] == 0)
    artery_rows = collected_features.loc[collected_features["Artery Type"] == artery_type]
    print(artery_rows.shape)

    # Drop vessels that lack any of the plotted features; the second shape
    # shows how many rows survive.
    collected_features_selected = artery_rows.dropna(subset=feature_names)
    print(collected_features_selected.shape)
    # Optional exclusion of hand-picked images, currently disabled:
    # collected_features_selected = collected_features_selected[~collected_features_selected["Image Name"].isin(discard)]
    # print(collected_features_selected.shape)

    fig, axs = plt.subplots(1, 6, figsize=(25, 5))
    for panel, feature_name in zip(axs, feature_names):
        violin_plots(collected_features_selected, feature_name, disease_type, panel, artery_type)
    plt.tight_layout()
    plt.show()

In [None]:
# fig, axs = plt.subplots(1, 3, figsize=(5*3, 5))  
# for i, artery_type in enumerate(ARTERY_TYPES):
#     violin_plots(collected_features.loc[collected_features["Artery Type"]==artery_type, :], 
#                  collected_features.loc[collected_features["Hyalinosis Severity"]==0, :],
#                  "Hyalinosis Area Ratio", "Hyalinosis", axs[i], artery_type)  
# plt.tight_layout()
# plt.show()

In [None]:
# # Assuming you have already defined 'collected_features'
# # Specify the columns to check for NaNs
# columns_to_check = [
#     'Intima Average', 'Intima Median', 'Intima Variance',
#     'Vis Intima Peak Indice', 'Intima Peak Height',
#     'Intima Peak Prominence', 'Media Average', 'Media Median',
#     'Media Variance', 'Vis Media Peak Indice', 'Media Peak Height',
#     'Media Peak Prominence', 'Ratio Average', 'Ratio Median',
#     'Ratio Variance', 'Vis Ratio Peak Indice', 'Ratio Peak Height',
#     'Ratio Peak Prominence'
# ]

# # This will select the rows from collected_features where all columns in columns_to_check are NaN
# # rows_all_nan = collected_features[collected_features[columns_to_check].isna().all(axis=1)]

# # Create a boolean mask where at least one column is NaN
# at_least_one_nan = collected_features[columns_to_check].isna().any(axis=1)

# # Create a boolean mask where not all columns are NaN
# not_all_nan = ~collected_features[columns_to_check].isna().all(axis=1)

# # Combine both masks to filter rows
# rows_with_some_nans = collected_features[at_least_one_nan & not_all_nan]


# images, titles = [], []
# # Ensure combined_classifications DataFrame is defined correctly with the right columns
# for index, row in rows_with_some_nans.iterrows():
#     artery_type = row["Artery Type"]
#     # Construct the path to the image file
#     image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
#                                 row["Image Name"].replace(".png", "_w_ann.png"))
#     img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
#     if img is not None:
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
#         images.append(img)
#         # Create a title using multiple fields from the DataFrame
#         title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
#         titles.append(title)

# # collected_features = collected_features.dropna()
# gallery_view(images, titles, 4)

In [None]:
# rows_with_nan = collected_features[collected_features.isna().any(axis=1)]
# images, titles = [], []
# # Ensure combined_classifications DataFrame is defined correctly with the right columns
# for index, row in rows_with_nan.iterrows():
#     artery_type = row["Artery Type"]
#     # Construct the path to the image file
#     image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
#                                 row["Image Name"].replace(".png", "_w_ann.png"))
#     img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
#     if img is not None:
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
#         images.append(img)
#         # Create a title using multiple fields from the DataFrame
#         title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
#         titles.append(title)

# collected_features = collected_features.dropna()
# collected_features.head()

In [None]:
# gallery_view(images, titles)

In [None]:

# feature_names = ['Intima Average', 'Intima Peak Height', 'Ratio Average', 'Ratio Peak Height']
# for artery_type in ARTERY_TYPES:
#     for disease_type in DISEASE_TYPES:
#         if disease_type != "Arteriosclerosis": continue
#         fig, axs = plt.subplots(1, len(feature_names), figsize=(5*len(feature_names), 5))  
#         for i, feature_name in enumerate(feature_names):
#             violin_plots(collected_features.loc[collected_features["Artery Type"]==artery_type, :], 
#                          feature_name, disease_type, axs[i])  
#         plt.tight_layout()
#         plt.show()

In [None]:
# Aggregate per-vessel features into one row of summary statistics per slide
# and write the enriched patient table to `agg_feature_path`.
pat_df = pd.read_csv(VESSEL_PAT_INFO_W_SCORE_PATH)
feature_path = FEATURES_PATH.replace(".xlsx", f"{suffix}.xlsx")

agg_feature_path = VESSEL_PAT_INFO_W_SCORE_W_FEATURE_PATH.replace(".csv", f"{suffix}.csv")

# Use a context manager so the workbook handle is released as soon as the
# sheet names have been read.
with pd.ExcelFile(feature_path, engine='openpyxl') as feature_workbook:
    available_sheetnames = feature_workbook.sheet_names
logging.info(f"{len(pat_df)} slides selected, {len(pat_df) - len(available_sheetnames)} discarded, "
             f"{len(available_sheetnames)} left for analysis.")

# Loop-invariant: the feature columns summarised for every artery type.
summarised_feature_names = base_feature_names + measurement_feature_names

# Statistics are collected here and attached to pat_df in one step after the
# loop; inserting hundreds of columns one scalar at a time fragments the frame.
stat_index, stat_rows = [], []

for i, (index, row) in enumerate(pat_df.iterrows()):
    slide_filename = row["WSI_Selected"]
    logging.info(f"Processing: {i+1}/{len(pat_df)}: {slide_filename}")
    slide_basename = os.path.splitext(slide_filename)[0]

    features = get_veesel_sheets(feature_path, slide_basename, available_sheetnames, remove_others=True)
    if features.empty:
        continue  # No relevant data for this slide; its statistics stay empty.

    # NOTE(review): `measurements` is not used below. The call is kept in case
    # get_measurements has side effects - confirm, then remove if it has none.
    measurements_path = os.path.join(MEASUREMENTS_DIR, f"{slide_basename}{suffix}.json")
    measurements = get_measurements(measurements_path, clean=True)

    # The mapped severity columns are not read again in this cell.
    for disease_type in DISEASE_TYPES:
        features[f"{disease_type} Severity"] = features[f"{disease_type} Severity"].map(CLASSIFICATION_SEVERITY_MAPPING)

    slide_stats = {}
    for artery_type in ARTERY_TYPES:
        # Filter once per artery type instead of once per feature.
        artery_features = features[features['Artery Type'] == artery_type]
        for feature_name in summarised_feature_names:
            features_series = artery_features[feature_name]
            if features_series.empty:
                # No vessel of this type on the slide: leave the cells empty.
                summary = dict.fromkeys(("Max", "Mean", "Median", "75th", "25th"), float("nan"))
            else:
                summary = {
                    "Max": features_series.max(),
                    "Mean": features_series.mean(),
                    "Median": features_series.median(),
                    "75th": features_series.quantile(0.75),
                    "25th": features_series.quantile(0.25),
                }
            for stat_name, stat_value in summary.items():
                slide_stats[f'{stat_name}_{feature_name}_in_{artery_type}'.replace(" ", "_")] = stat_value

    stat_index.append(index)
    stat_rows.append(slide_stats)

if stat_rows:
    slide_stats_df = pd.DataFrame(stat_rows, index=stat_index)
    # Columns already present in the input are overwritten for the processed
    # slides only; all other columns are appended in creation order.
    existing_cols = [col for col in slide_stats_df.columns if col in pat_df.columns]
    if existing_cols:
        pat_df.loc[slide_stats_df.index, existing_cols] = slide_stats_df[existing_cols]
    pat_df = pat_df.join(slide_stats_df.drop(columns=existing_cols))

pat_df.to_csv(agg_feature_path, index=False)