In [1]:
# Basic imports
import os
import sys
import cv2 # type: ignore
import pandas as pd # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd

from docx import Document
from docx.shared import Inches

sys.path.append(os.path.abspath('..'))
from utils.utils_data import get_classifications
from utils.utils_survival import survival_analysis
from utils.utils_vis import save_fig
from utils.utils_constants import (ARTERY_TYPES,
                                   DISEASE_TYPES,
                                   CLASSIFICATION_SEVERITY_MAPPING, 
                                   VESSEL_NEPTUNE_PAT_INFO_PATH as VESSEL_PAT_INFO_PATH, 
                                   VESSEL_NEPTUNE_PAT_INFO_W_SCORE_PATH as VESSEL_PAT_INFO_W_SCORE_PATH,
                                   CLASSIFICATION_PATH, 
                                   COMBINED_CLASSIFICATION_PATH,
                                   ANALYSIS_DOC_PATH,
                                   CROPPED_VESSELS_DIR)

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
pat_df = pd.read_csv(VESSEL_PAT_INFO_PATH)

combined_classifications = pd.read_csv(COMBINED_CLASSIFICATION_PATH)
combined_classifications = combined_classifications[combined_classifications["Artery Type"] != "Others"]
combined_classifications['Artery Type'] = pd.Categorical(combined_classifications['Artery Type'], categories=ARTERY_TYPES)

for col in ['Arteriosclerosis Severity', 'Hyalinosis Severity']:
    combined_classifications[col] = combined_classifications[col].map(CLASSIFICATION_SEVERITY_MAPPING)


In [3]:
available_sheetnames = pd.ExcelFile(CLASSIFICATION_PATH).sheet_names

artery_counts_by_case = {artery_type: [] for artery_type in ['All Arteries'] + ARTERY_TYPES}

for i, (index, row) in enumerate(pat_df.iterrows()):
    slide_filename = row["WSI_Selected"]
    slide_basename = os.path.splitext(slide_filename)[0]

    classifications = get_classifications(CLASSIFICATION_PATH, slide_basename, available_sheetnames)

    if classifications.empty:
        continue  # Skip to if no relevant data
    classifications['Artery Type'] = pd.Categorical(classifications['Artery Type'], categories=ARTERY_TYPES)

    for col in ['Arteriosclerosis Severity', 'Hyalinosis Severity']:
        classifications[col] = classifications[col].map(CLASSIFICATION_SEVERITY_MAPPING)

    for col in ['Arteriosclerosis Severity', 'Hyalinosis Severity']:
        severity_by_type = {}
        for artery_type in ARTERY_TYPES:
            severity_series = classifications[classifications['Artery Type'] == artery_type][col]
            if not severity_series.empty:
                # Calculating various statistics
                max_severity = severity_series.max()
                mean_severity = severity_series.mean()
                median_severity = severity_series.median()
                percentile_75th = severity_series.quantile(0.75)
                non_zero_percentage = (severity_series > 0).sum() / len(severity_series)  # Percentage of non-zero severities
            else:
                max_severity = mean_severity = median_severity = percentile_75th = non_zero_percentage = -1  # or another indicator for no data

            # Storing these values in the DataFrame
            pat_df.loc[index, f'Max_{col}_in_{artery_type}'.replace(" ", "_")] = max_severity
            pat_df.loc[index, f'Mean_{col}_in_{artery_type}'.replace(" ", "_")] = mean_severity
            pat_df.loc[index, f'Median_{col}_in_{artery_type}'.replace(" ", "_")] = median_severity
            pat_df.loc[index, f'75th_{col}_in_{artery_type}'.replace(" ", "_")] = percentile_75th
            pat_df.loc[index, f'NonZeroPct_{col}_in_{artery_type}'.replace(" ", "_")] = non_zero_percentage

    artery_counts_by_case['All Arteries'].append(len(classifications))
    for artery_type in ARTERY_TYPES:
        count = (classifications['Artery Type'] == artery_type).sum()
        artery_counts_by_case[artery_type].append(count)

2024-06-25 18:23:32,108 - INFO - Sheet 12_26609_001_011 LUNK5 TRI not found in the classifications file.
2024-06-25 18:23:32,569 - INFO - Sheet 12_26609_001_502 LUNK TRI not found in the classifications file.
2024-06-25 18:23:34,405 - INFO - Sheet 13_26609_007_001 L10 TRI not found in the classifications file.
2024-06-25 18:23:34,407 - INFO - Sheet 0_957_A_0052826 not found in the classifications file.
2024-06-25 18:23:38,149 - INFO - Sheet 0_2878_A_0048364 not found in the classifications file.
2024-06-25 18:23:43,313 - INFO - Sheet 13_26609_022_023 L02 TRI not found in the classifications file.
2024-06-25 18:23:43,316 - INFO - Sheet 13_26609_022_024 L02 TRI not found in the classifications file.
2024-06-25 18:23:53,066 - INFO - Sheet 11_26609_025_503 L UNK TRI not found in the classifications file.
2024-06-25 18:23:54,390 - INFO - Sheet 13_26609_025_514 LUNK TRI not found in the classifications file.
2024-06-25 18:24:01,510 - INFO - Sheet 11_26609_027_509_L3_TRI not found in the clas

In [4]:
pat_df.to_csv(VESSEL_PAT_INFO_W_SCORE_PATH, index=False)

In [5]:
# Create a new Word document
doc = Document()
doc.add_heading('Artery Classification Report', level=0)
doc.add_page_break()

<docx.text.paragraph.Paragraph at 0x798c1aac0040>

In [6]:
def plot_artery_counts_by_case(artery_type, counts, ax):
    # Define the bins for the histogram - bins from 0 to 25, with an extra bin for values >25
    bins = np.arange(28)
    # Process counts to clip at 25
    processed_counts = [min(x, 26) for x in counts]  # Values greater than 25 are set to 25
    # Plot histogram with defined bins
    ax.hist(processed_counts, bins=bins, alpha=0.5, color='blue')
    # Labels for x-ticks, handling >25 as a special case
    labels = [str(i) for i in range(26)] + ['>25']
    # Set x-tick labels
    bin_width = bins[1] - bins[0]
    ax.set_xticks(np.arange(len(labels)) * bin_width + bin_width / 2)
    ax.set_xticklabels(labels, fontsize=15)  # Set font size for x-ticks
    ax.tick_params(axis='y', which='major', labelsize=15)  # Set font size for y-ticks

    # Set axis labels and title with context-specific information
    ax.set_xlabel(f'Count of {artery_type} per Whole Slide Image', fontsize=18)
    ax.set_ylabel('Frequency of Slides', fontsize=18)
    ax.set_title(f'Distribution of {artery_type} Across Slides', fontsize=20)

doc.add_heading(f'Section 1: Data Overview', level=1)
doc.add_paragraph(
    f"Initially, {len(pat_df)} slides were selected from the Neptune repository for this study, all of which have been either manually annotated "
    f"or predicted through deep learning and subsequently quality controlled. However, due to issues such as poor staining quality, "
    f"nephrectomy specimens, or the absence of arteries, {len(pat_df) - len(artery_counts_by_case['All Arteries'])} slides were discarded, leaving "
    f"{len(artery_counts_by_case['All Arteries'])} for analysis."
)
doc.add_paragraph(
    f"In total, N={np.sum(artery_counts_by_case['Arterioles'])} arterioles, {np.sum(artery_counts_by_case['Interlobular Arteries'])} interlobular arteries, "
    f"and {np.sum(artery_counts_by_case['Arcuate Arteries'])} arcuate arteries were segmented and visually scored (0-3) for arteriosclerosis and hyalinosis."
)

doc.add_heading('Histograms for Artery Counts', level=2)

doc.add_paragraph(
    "This section presents the distribution of artery counts per slides. "
    "Each histogram below represents the frequency of slides containing specific counts of each artery type."
)

for artery_type in ARTERY_TYPES:
    counts = artery_counts_by_case[artery_type]
    fig, ax = plt.subplots(figsize=(18, 5))  # Create a single subplot directly
    plot_artery_counts_by_case(artery_type, counts, ax)
    plot_filename = f"{artery_type.replace(' ', '_')}_count_analysis.png"
    save_fig(fig, plot_filename)
    doc.add_picture(plot_filename, width=Inches(6))
    os.remove(plot_filename)  # Optional: remove the file after adding to the document

In [7]:
# Count occurrences of each severity level within each Artery Type
def barplot(counts, col, ax):
    # Plotting directly on the provided axis
    counts.plot(kind='bar', ax=ax, legend=True)
    ax.set_title(f'Distribution of {col} by Artery Type', fontsize=20)
    ax.set_ylabel('Count', fontsize=15)
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30, fontsize=18)
    ax.set_yticklabels(ax.get_yticks(), fontsize=18)
    # Legend configuration
    ax.legend(title=col, fontsize=15, title_fontsize=15)

    # Annotating bars with their heights
    max_height = max(counts.max())
    ax.set_ylim(0, max_height * 1.1)  # Scale y-axis to fit annotations
    for p in ax.patches:
        ax.annotate(f"{int(p.get_height())}", (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha='center', va='bottom', fontsize=10)
        
doc.add_heading('Bar Charts for Severity Distributions', level=2)
doc.add_paragraph(
    "The following figures illustrate the distribution of severity scores for Arteriosclerosis and Hyalinosis across different artery types. "
)

# Create a figure with subplots
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

for i, disease_type in enumerate(DISEASE_TYPES):
    # Calculate counts
    severity_counts = combined_classifications.groupby(['Artery Type', f'{disease_type} Severity']).size().unstack(fill_value=0)
    # Create the bar plot on the specified subplot
    barplot(severity_counts, f'{disease_type} Severity', axs[i])

# Adjust layout and save the figure
plt.tight_layout()
plot_filename = "disease_severity_distribution.png"
fig.savefig(plot_filename, format='png', bbox_inches='tight')
plt.close(fig)

# Assuming 'doc' is your Word document instance
doc.add_picture(plot_filename, width=Inches(6))
os.remove(plot_filename)  # Clean up the file after adding to the document
doc.add_page_break()

  severity_counts = combined_classifications.groupby(['Artery Type', f'{disease_type} Severity']).size().unstack(fill_value=0)
  ax.set_yticklabels(ax.get_yticks(), fontsize=18)
  severity_counts = combined_classifications.groupby(['Artery Type', f'{disease_type} Severity']).size().unstack(fill_value=0)
  ax.set_yticklabels(ax.get_yticks(), fontsize=18)


<docx.text.paragraph.Paragraph at 0x798c0d369100>

In [8]:
def distribution_analysis(df, col, ax, title, color):
    # Check if the data is continuous or discrete
    # Assuming data is discrete if unique values are few and all are integers
    data = df[col].dropna()
    unique_values = np.sort(data.unique())
    is_all_integers = all(value.is_integer() for value in unique_values)

    is_continuous = len(unique_values) > 7 and not is_all_integers
    
    if is_continuous:
        # Handle zeros separately if present
        zero_count = (data == 0).sum()
        non_zero_data = data[data != 0]
        max_val = non_zero_data.max()
        
        bins = np.linspace(0, max_val, 6)  # Create bins between 0 and max_val
        bins = np.insert(bins, 0, -np.finfo(float).eps)  # Start bins from zero

        # Bin the data
        data_binned = pd.cut(data, bins=bins, include_lowest=True, right=True)
        counts = data_binned.value_counts().sort_index()

        # Create labels for bins
        labels = ['0'] if zero_count > 0 else []  # Label for zero
        labels += [f"({bins[i]:.2f}, {bins[i+1]:.2f}]" for i in range(1, len(bins)-1)]

        # Plot the counts
        counts.plot(kind='bar', ax=ax, color=color, alpha=0.75)
        ax.set_xticks(np.arange(len(labels)))
        ax.set_xticklabels(labels, rotation=45)  # Rotate labels for better visibility
    else:
        if is_all_integers:
            unique_values = unique_values.astype(int)
        # Use bar chart for discrete data
        counts = data.value_counts().sort_index()
        counts.plot(kind='bar', ax=ax, color=color, alpha=0.75)
        ax.set_xticks(range(len(unique_values)))
        ax.set_xticklabels(unique_values, rotation=0)

    
    # Setting the labels and titles
    ax.set_xlabel('Severity Score', fontsize=15)
    ax.set_title(title, fontsize=18)
    ax.set_ylabel('Count', fontsize=15)
    ax.tick_params(axis='y', which='major', labelsize=15)

    # Annotate bars with counts
    for p in ax.patches:
        ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()), 
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')

def add_distribution_analysis_to_doc(doc, combined_classifications, pat_df_selected, 
                                     artery_type, disease_type, agg_metric, severity_column):
    fig = plt.figure(figsize=(18, 5))
    gs = gridspec.GridSpec(1, 3)
    ax1 = fig.add_subplot(gs[0])
    distribution_analysis(combined_classifications.loc[combined_classifications["Artery Type"] == artery_type], 
                            f'{disease_type} Severity', ax1, f'{artery_type} Count by Severity', '#87CEEB')

    ax2 = fig.add_subplot(gs[1])
    distribution_analysis(pat_df_selected, severity_column, ax2, "Case Count by Severity", "#F88379")

    ax3 = fig.add_subplot(gs[2])
    distribution_analysis(pat_df_selected.loc[pat_df_selected['ESRDorEGFR40BX_LR'] == 1, :], 
                            severity_column, ax3, "Event Count by Severity", "#D8BFD8")
    plt.tight_layout()
    # plt.show()

    plot_filename = f"{artery_type.replace(' ', '_')}_{disease_type}_{agg_metric}_analysis.png"
    save_fig(fig, plot_filename)
    doc.add_picture(plot_filename, width=Inches(6))
    os.remove(plot_filename)


# pat_df['DaysBXtoESRDorEGFR40_LR'] = pd.to_numeric(pat_df['DaysBXtoESRDorEGFR40_LR'], errors='coerce')
pat_df['ESRDorEGFR40BX_LR'] = pat_df['ESRDorEGFR40BX_LR'].map({'1: Yes': 1, '0: No': 0}).astype(int)

# Iterate over artery types
for sec_num, artery_type in enumerate(ARTERY_TYPES):
    doc.add_heading(f'Section {sec_num + 2}: {artery_type}', level=1)
    counts = artery_counts_by_case[artery_type]
    doc.add_paragraph(f"{np.sum(counts)} {artery_type} extracted from {np.sum(np.array(counts)>0)} slides. "
                      f"We already have {artery_type}-level scores, and we need to aggregate to the case level. "
                      "We investigate multiple aggregation metrics and their survival analysis impact.")

    for sub_sec_num, agg_metric in enumerate(["Max", "Median", "75th", "Mean", "NonZeroPct"]):
        doc.add_heading(f'Section {sec_num+2}.{sub_sec_num+1}: {artery_type}, Aggregated by {agg_metric}', level=2)
        for sub_sub_sec_num, disease_type in enumerate(["Arteriosclerosis", "Hyalinosis"]):
            doc.add_heading(f'{disease_type}', level=3)
            severity_column = f"{agg_metric}_{disease_type}_Severity" if artery_type == 'All Arteries' else \
                "_".join([agg_metric, disease_type, "Severity", "in", artery_type.replace(" ", "_")])
            pat_df_selected = pat_df.loc[pat_df[severity_column].notna() & (pat_df[severity_column] >= 0), :].copy()
            add_distribution_analysis_to_doc(doc, combined_classifications, pat_df_selected, 
                                             artery_type, disease_type, agg_metric, severity_column)
            
            thresholds = [0]
            if agg_metric == "Max":
                thresholds += [1, 2]
            else:
                non_zero_vals = pat_df_selected[severity_column][pat_df_selected[severity_column] > 0]
                if len(non_zero_vals) != 0: 
                    thresholds += [np.percentile(non_zero_vals, 25), np.median(non_zero_vals)]

            fig, axs = plt.subplots(1, 3, figsize=(22, 6))
            for i, th in enumerate(thresholds):
                if agg_metric == "Max":
                    label_1 = f"≤{int(th)}"
                    label_2 = f">{int(th)}"
                else:
                    percentile_label = "25th" if i == 1 else "median" if i == 2 else "0"
                    label_1 = f"≤ {percentile_label} ({th:.2f})"
                    label_2 = f"> {percentile_label} ({th:.2f})"

                groups = [(label_1, pat_df_selected[pat_df_selected[severity_column] <= th]),
                        (label_2, pat_df_selected[pat_df_selected[severity_column] > th])]
                survival_analysis(groups, axs[i])
                if i == 0:
                    axs[i].set_ylabel('Survival Probability', fontsize=15)
            plt.tight_layout()
            plot_filename = f"{artery_type.replace(' ', '_')}_{disease_type}_{agg_metric}_survival.png"
            save_fig(fig, plot_filename)
            doc.add_picture(plot_filename, width=Inches(6))
            os.remove(plot_filename)
    doc.add_page_break()

In [9]:

def gallery_view(images, titles, cols=5):
    # Number of images to show per page/view
    num_images = len(images)
    rows_per_view = 1  # Show one row at a time

    # Calculate the number of views needed
    total_views = (num_images + cols - 1) // cols
    plot_filenames = []  # To keep track of saved image file paths

    for view in range(total_views):
        start_index = view * cols
        end_index = min(start_index + cols, num_images)
        fig, axs = plt.subplots(rows_per_view, cols, figsize=(15, 5 * rows_per_view))
        axs = axs.ravel()
        for i in range(cols):
            index = start_index + i
            if index < end_index:
                image = images[index]
                # Rotate image if width is greater than height
                if image.shape[1] > image.shape[0]:  # image.shape gives (height, width, channels)
                    image = np.rot90(image)  # Rotate 90 degrees
                axs[i].imshow(image)
                axs[i].set_title(titles[index], fontsize=18)
                axs[i].axis('off')
            else:
                axs[i].axis('off')
        plt.tight_layout()
        # Save the figure to file
        plot_filename = f"gallery_view_{view}.png"
        save_fig(fig, plot_filename)
        plot_filenames.append(plot_filename)
    return plot_filenames


def load_images_for_type(artery_type, severity_column):
    images = []
    titles = []
    # Ensure combined_classifications DataFrame is defined correctly with the right columns
    for index, row in combined_classifications.loc[
        (combined_classifications["Artery Type"] == artery_type) &
        (combined_classifications[severity_column] > 0), :
    ].iterrows():
        # Construct the path to the image file
        image_path = os.path.join(CROPPED_VESSELS_DIR, artery_type, 
                                  row["Image Name"].replace(".png", "_w_ann.png"))
        img = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Correct function to load the image
        if img is not None:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB for correct color display
            images.append(img)
            # Create a title using multiple fields from the DataFrame
            title = f"B{row['Image Name'].split('_')[1]}{row['Artery ID']}-AS: {row['Arteriosclerosis Severity']}; HS: {row['Hyalinosis Severity']}"
            titles.append(title)
    return images, titles

doc.add_heading(f'Appendix', level=1)
# Implementation in the document creation process
for artery_type, severity_column in [
    ("Arterioles", "Arteriosclerosis Severity"),
    ("Arcuate Arteries", "Hyalinosis Severity"),
    ("Interlobular Arteries", "Hyalinosis Severity")
]:
    doc.add_heading(f"{artery_type} with {severity_column} > 0", level=2)
    images, titles = load_images_for_type(artery_type, severity_column)
    plot_filenames = gallery_view(images, titles)
    for plot_filename in plot_filenames:
        doc.add_picture(plot_filename, width=Inches(6))
        os.remove(plot_filename)
    doc.add_page_break()


In [10]:
doc.save(ANALYSIS_DOC_PATH)

In [11]:
# pat_df_selected = pat_df.loc[
#      (pat_df["Max_Arteriosclerosis_Severity_in_Interlobular_Arteries"] >= 0) &
#      (pat_df["Max_Arteriosclerosis_Severity_in_Arcuate_Arteries"] >= 0)
# ].copy()
# doc.add_paragraph(
#         f"{len(pat_df_selected)} cases have both Interlobular and Arcuate present. "
#     )
# print(pat_df_selected.shape)
# pat_df_selected = pat_df_selected.loc[
#      (pat_df_selected["Max_Arteriosclerosis_Severity_in_Interlobular_Arteries"] > 0) |
#      (pat_df_selected["Max_Arteriosclerosis_Severity_in_Arcuate_Arteries"] > 0)
# ]
# doc.add_paragraph(f"{len(pat_df_selected)} cases have at least one artery type with arteriosclerosis severity greater than 0.")


In [12]:

# # Assuming pat_df_selected is already filtered as required
# fig = plt.figure(figsize=(12, 7))

# # Generate x values with a slight offset to prevent overlapping
# x_interlobular = np.arange(len(pat_df_selected)) 
# x_arcuate = np.arange(len(pat_df_selected)) 

# plt.scatter(x_interlobular, pat_df_selected['Max_Arteriosclerosis_Severity_in_Interlobular_Arteries'],
#             color='blue', alpha=0.6, edgecolor='black', marker='s', label='Interlobular Arteries')

# plt.scatter(x_arcuate, pat_df_selected['Max_Arteriosclerosis_Severity_in_Arcuate_Arteries'],
#             color='red', alpha=0.6, edgecolor='black', marker='^', label='Arcuate Arteries')

# # Add labels and title
# plt.xlabel('Index of Cases', fontsize=16)
# plt.ylabel('Max Arteriosclerosis Severity', fontsize=16)
# plt.title('Comparison of Arteriosclerosis Severity Between Artery Types', fontsize=18)

# # Explicitly set y-ticks to be only integers 0, 1, 2, 3
# plt.yticks([0, 1, 2, 3])
# plt.tick_params(axis='both', which='major', labelsize=14)

# # Add a legend
# plt.legend()

# # Add grid for better readability
# plt.grid(True, linestyle='--', alpha=0.6)

# # Show the plot
# plt.show()

# plot_filename = f"comparison.png"
# save_fig(fig, plot_filename)
# doc.add_picture(plot_filename, width=Inches(6))
# os.remove(plot_filename)  # Optional: remove the file after adding to the document



In [13]:
# # Count where Interlobular severity is greater than Arcuate severity
# count_greater = (pat_df_selected['Max_Arteriosclerosis_Severity_in_Interlobular_Arteries'] >
#                  pat_df_selected['Max_Arteriosclerosis_Severity_in_Arcuate_Arteries']).sum()

# # Count where Interlobular severity is less than Arcuate severity
# count_less = (pat_df_selected['Max_Arteriosclerosis_Severity_in_Interlobular_Arteries'] <
#               pat_df_selected['Max_Arteriosclerosis_Severity_in_Arcuate_Arteries']).sum()

# # Optionally, count where Interlobular severity is equal to Arcuate severity
# count_equal = (pat_df_selected['Max_Arteriosclerosis_Severity_in_Interlobular_Arteries'] ==
#                pat_df_selected['Max_Arteriosclerosis_Severity_in_Arcuate_Arteries']).sum()

# doc.add_paragraph(
#         f"Number of cases where Interlobular severity > Arcuate severity: {count_greater}. "
#     )
# doc.add_paragraph(
#         f"Number of cases where Interlobular severity < Arcuate severity: {count_less}. "
#     )
# # Print the results
# print("Number of cases where Interlobular severity > Arcuate severity:", count_greater)
# print("Number of cases where Interlobular severity < Arcuate severity:", count_less)
# print("Number of cases where Interlobular severity = Arcuate severity:", count_equal)