In [None]:
import os

import prismtoolbox as ptb
import prismtoolbox.wsiemb as ptb_emb

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator
from pathlib import Path

In [None]:
# Load the breast cohort table. Later cells read its slide-id columns
# ("ihc_slide_id", "he_slide_id") and its ORR column from `df`.
df = pd.read_csv("../materials/dataset_breast_final_v2.csv")
df

# Daisy clustering analysis

In [None]:
# Destination folder for the files written by the Daisy analysis cells below.
output_dir = "../Results/BREAST/clustering/Daisy_analysis"
# Create the folder (and any missing parents); no error if it already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Embedding tag: selects the feats_80_<emb_type> feature directory and is embedded
# in the names of the files saved by this section.
emb_type = "clam"

In [None]:
# Slides retained for the Daisy analysis and the per-slide feature file of each one.
slides_to_analyse = pd.read_csv("../materials/slides_to_analyse_DAISY_analysis_BREAST.csv")
slide_names = slides_to_analyse["slide_name"].values
feats_dir = f"../Results/BREAST/clustering/feats_80_{emb_type}/"
feats_files = [os.path.join(feats_dir, name + ".pt") for name in slide_names]
# One processor over all selected slides; slide ids are the bare slide names.
emb_processor = ptb_emb.EmbeddingProcessor(
    feats_files,
    slide_ids=slide_names,
    cmap="Set1",
)

In [None]:
# Search for the number of clusters (bounds min_clusters=7 / max_clusters=12) with
# mini-batch k-means, scored with the Davies-Bouldin metric on normalized embeddings.
optimal_nb, scores = emb_processor.get_optimal_number_clusters(
    "kmeans_mini_batch",
    init="k-means++",
    init_size=3000,
    batch_size=1000,
    n_init=500,
    max_no_improvement=100,
    min_clusters=7,
    max_clusters=12,
    metric_name="davies_bouldin",
    normalize=True,
    random_state=1,
)
# Show the selected cluster count together with the returned scores.
optimal_nb, scores

In [None]:
# Keep the scores from the cluster-count search next to the other Daisy outputs.
scores_path = os.path.join(output_dir, f"scores_{emb_type}.csv")
pd.DataFrame(scores).to_csv(scores_path, index=False)

In [None]:
# Build the mini-batch k-means model with the cluster count selected above, reusing
# the same k-means hyper-parameters as the search cell.
# NOTE(review): the search cell passes normalize=True but this call does not pass
# `normalize` — confirm the default matches what the search used.
emb_processor.create_cluster_model(
    "kmeans_mini_batch",
    n_clusters=optimal_nb,
    init="k-means++",
    init_size=3000,
    batch_size=1000,
    n_init=500,
    max_no_improvement=100,
    random_state=1,
)

In [None]:
# Save the processor's cluster model under output_dir, tagged with the embedding type.
emb_processor.save_cluster_model(os.path.join(output_dir, f"cluster_model_{emb_type}.pkl"))

In [None]:
# Load the cluster model back from the same path the save cell writes to.
emb_processor.import_cluster_model(os.path.join(output_dir, f"cluster_model_{emb_type}.pkl"))

In [None]:
# Rows of `df` whose IHC slide (id without the ".svs" extension) belongs to the analysis.
# regex=False makes the replacement literal: with the regex default of pandas < 2.0 the
# "." in ".svs" is a wildcard, so any character followed by "svs" would be removed.
ihc_in_analysis = df["ihc_slide_id"].str.replace(".svs", "", regex=False).isin(slide_names)
selected_rows = df[ihc_in_analysis]

# Per-slide cluster percentages, in the row order of `df`.
cluster_percentages = [
    emb_processor.get_cluster_percentages_for_slide(slide_id.split(".")[0])
    for slide_id in selected_rows["ihc_slide_id"].values
]

# Binary response label per selected row (same order as cluster_percentages).
# ORR values missing from the mapping become NaN.
labels = selected_rows.ORR.map(
    {
        "Complete response": "Responder",
        "Partial response": "Responder",
        "Progressive disease": "Non-responder",
        "Stable disease": "Non-responder",
    }
).values
labels

In [None]:
# Wide table: one row per slide, one column per cluster, plus the response label.
percentages_wide = pd.DataFrame(cluster_percentages)
percentages_wide["Response to treatment"] = labels
# Scale every column except the trailing label column by 100.
percentages_wide.iloc[:, :-1] *= 100
wide_csv = os.path.join(output_dir, f"cluster_percentages_{emb_type}.csv")
percentages_wide.to_csv(wide_csv, index=False)

# Long table used for plotting: one row per (slide, cluster) pair.
df_plot = percentages_wide.melt(
    id_vars='Response to treatment',
    var_name='Cluster',
    value_name='Percentage',
)
melted_csv = os.path.join(output_dir, f"cluster_percentages_{emb_type}_melted.csv")
df_plot.to_csv(melted_csv, index=False)

In [None]:
# Box plot of the per-cluster percentages, split by response group, saved as a PNG.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=df_plot,
    x='Cluster',
    y='Percentage',
    hue='Response to treatment',
)
ax.set_title('Boxplot of Cluster Percentages Stratified  by Response to Treatment')
boxplot_path = os.path.join(output_dir, f"boxplot_{emb_type}.png")
plt.savefig(boxplot_path, dpi=300, bbox_inches="tight")

# New clustering analysis

In [None]:
# Destination folder for the files written by the "New clustering analysis" cells.
# This rebinds `output_dir`, which the Daisy section used for its own folder.
output_dir = "../Results/BREAST/clustering/New_cluster_analysis"
# Create the folder (and any missing parents); no error if it already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Toggle: also use the "DAB" features (True) or only the "cells" features (False).
with_DAB = False
# Tag embedded in the names of the files written by this section.
if with_DAB:
    suffix = "DAB"
else:
    suffix = "only"

In [None]:
# Collect, for every entry of the registration folder, the first non-null
# "non_rigid_rTRE" value found in that slide's summary CSV.
reg_HE2IHC = "../Results/BREAST/clustering/reg_HE2IHC/"
slides, error_HE2IHC = [], []
for slide in os.listdir(reg_HE2IHC):
    slides.append(slide)
    summary_csv = os.path.join(reg_HE2IHC, slide, "data", f"{slide}_summary.csv")
    df_HE2IHC = pd.read_csv(summary_csv)
    rtre_values = df_HE2IHC["non_rigid_rTRE"].dropna().values
    error_HE2IHC.append(rtre_values[0])

In [None]:
# Registration error per slide; display the ten largest errors.
df_reg_error = pd.DataFrame({"slide": slides, "error_HE2IHC": error_HE2IHC})
errors_desc = df_reg_error.sort_values(by="error_HE2IHC", ascending=False)
errors_desc.head(10)

In [None]:
# Slides retained for this analysis and the per-slide feature file of each one.
slides_to_analyse = pd.read_csv("../materials/slides_to_analyse_New_cluster_analysis_BREAST.csv")
slide_names = slides_to_analyse["slide_name"].values
feats_dir = "../Results/BREAST/clustering/feats_128_cell_stain_based_reg_warped_merged_HE/"
feats_files = [os.path.join(feats_dir, name + ".pt") for name in slide_names]
# Feature names are read from the CSV stored alongside the feature files.
emb_processor = ptb_emb.EmbeddingProcessor(
    feats_files,
    embeddings_names=os.path.join(feats_dir, "embeddings_names.csv"),
    slide_ids=slide_names,
    cmap="Set1",
)

In [None]:
# Keep the features whose name contains "cells", plus those containing "DAB"
# when with_DAB is set.
name_tokens = ("cells", "DAB") if with_DAB else ("cells",)
selected_feats = [
    name
    for name in emb_processor.embeddings_names
    if any(token in name for token in name_tokens)
]

In [None]:
# Search for the number of clusters (bounds min_clusters=7 / max_clusters=12) with
# k-means on the selected, normalized features, scored with Calinski-Harabasz.
optimal_nb, scores = emb_processor.get_optimal_number_clusters(
    "kmeans",
    selected_features=selected_feats,
    n_init=20,
    min_clusters=7,
    max_clusters=12,
    metric_name="calinski_harabasz",
    normalize=True,
    random_state=1,
)
# Show the selected cluster count together with the returned scores.
optimal_nb, scores

In [None]:
# Save the scores from the cluster-count search next to the other outputs of this section.
# The file name has no leading "." so that it is not written as a hidden file and
# follows the same naming pattern as the other CSVs saved by this notebook.
scores_path = os.path.join(output_dir, f"scores_cells_feats_{suffix}.csv")
pd.DataFrame(scores).to_csv(scores_path, index=False)

In [None]:
# Build the k-means model on the selected, normalized features with the cluster
# count selected above; n_init and random_state match the search cell.
emb_processor.create_cluster_model(
    "kmeans",
    normalize=True,
    n_clusters=optimal_nb,
    selected_features=selected_feats,
    n_init=20,
    random_state=1,
)

In [None]:
# Save the processor's cluster model under output_dir, tagged with the feature-set suffix.
emb_processor.save_cluster_model(os.path.join(output_dir, f"cluster_model_cells_feats_{suffix}.pkl"))

In [None]:
# Load the cluster model back from the same path the save cell writes to.
emb_processor.import_cluster_model(os.path.join(output_dir, f"cluster_model_cells_feats_{suffix}.pkl"))

In [None]:
# Rows of `df` whose H&E slide (id without the ".svs" extension) belongs to the analysis.
# regex=False makes the replacement literal: with the regex default of pandas < 2.0 the
# "." in ".svs" is a wildcard, so any character followed by "svs" would be removed.
he_in_analysis = df["he_slide_id"].str.replace(".svs", "", regex=False).isin(slide_names)
selected_rows = df[he_in_analysis]

# Per-slide cluster percentages on the selected features, in the row order of `df`.
cluster_percentages = [
    emb_processor.get_cluster_percentages_for_slide(
        slide_id.split(".")[0],
        selected_features=selected_feats,
    )
    for slide_id in selected_rows["he_slide_id"].values
]

# Binary response label per selected row (same order as cluster_percentages).
# ORR values missing from the mapping become NaN.
labels = selected_rows.ORR.map(
    {
        "Complete response": "Responder",
        "Partial response": "Responder",
        "Progressive disease": "Non-responder",
        "Stable disease": "Non-responder",
    }
).values
labels

In [None]:
# Wide table: one row per slide, one column per cluster, plus the response label.
df_plot = pd.DataFrame(cluster_percentages)
df_plot["Response to treatment"] = labels
# Scale every column except the trailing label column by 100.
df_plot.iloc[:, :-1] *= 100

# Patient identifiers for the same rows of `df` that produced cluster_percentages.
# The mask must compare extension-less H&E ids against `slide_names`: passing the
# `slides_to_analyse` DataFrame to isin() only tests against its column label, which
# selects no row and leaves the identifier columns empty in the exported file.
he_in_analysis = df["he_slide_id"].str.replace(".svs", "", regex=False).isin(slide_names)
patient_info = df.loc[he_in_analysis, ["TdxNumber ", "Additional Specimen ID"]].reset_index(drop=True)
# NOTE(review): this file is written to the working directory with a fixed
# "without_dab" name, independent of output_dir and `suffix` — confirm this is intended.
pd.concat([df_plot, patient_info], axis=1).to_csv(
    "cluster_percentage_without_dab_with_info_patients.csv", index=False
)

df_plot.to_csv(os.path.join(output_dir, f"cluster_percentages_cells_feats_{suffix}.csv"), index=False)
# Long table used for plotting: one row per (slide, cluster) pair.
df_plot = df_plot.melt(id_vars='Response to treatment', var_name='Cluster', value_name='Percentage')
df_plot.to_csv(os.path.join(output_dir, f"cluster_percentages_cells_feats_{suffix}_melted.csv"), index=False)

In [None]:
# Box plot of the per-cluster percentages, split by response group, saved as a PNG.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=df_plot,
    x='Cluster',
    y='Percentage',
    hue='Response to treatment',
)
ax.set_title('Boxplot of Cluster Percentages Stratified  by Response to Treatment')
boxplot_path = os.path.join(output_dir, f"boxplot_cells_feats_{suffix}.png")
plt.savefig(boxplot_path, dpi=300, bbox_inches="tight")

In [None]:
# Export the raw selected features together with the cluster predicted for each row.
emb_mean = emb_processor.embeddings_stats["mean"]
emb_std = emb_processor.embeddings_stats["std"]
# Column positions of the selected features inside the full embeddings matrix.
selected_feats_idx = np.array(
    [pos for pos, name in enumerate(emb_processor.embeddings_names) if name in selected_feats]
)
features = pd.DataFrame(emb_processor.embeddings_matrix[:, selected_feats_idx], columns=selected_feats)
# The model is queried on standardized values (eps is added to the std in the
# denominator); the exported feature columns stay unnormalized.
standardized = (emb_processor.embeddings_matrix - emb_mean) / (emb_std + emb_processor.eps)
features['cluster'] = emb_processor.cluster_model.predict(standardized[:, selected_feats_idx])
# Only the "DAB_avg" / "N_cells" columns and the cluster id are written out.
features_to_plot = [col for col in features.columns if "DAB_avg" in col or "N_cells" in col]
features_to_plot = features_to_plot + ["cluster"]
features_csv = os.path.join(output_dir, f"features_cells_feats_{suffix}.csv")
features.loc[:, features_to_plot].to_csv(features_csv, index=False)

In [None]:
# Export, for each analysed slide, the patches assigned to one cluster as GeoJSON.
cluster_idx = 0
# Placeholder: replace with the folder holding the H&E slides processed for registration.
slide_dir_HE = 'path to HE slides processed for registration'
# Iterate over the slide names: iterating the `slides_to_analyse` DataFrame itself
# would yield its column labels instead of the slides.
for slide_HE in slide_names:
    slide_name_HE = slide_HE.split(".")[0]
    print(slide_name_HE)
    WSI_object = ptb.WSI(os.path.join(slide_dir_HE, f"{slide_name_HE}.ome.tif"), engine="tiffslide")
    WSI_object.load_patches(f"../Results/BREAST/clustering/patches_128_overlap_0_HE_warped/{slide_name_HE}.h5")
    # Presumably one assignment per loaded patch, aligned with WSI_object.coords — TODO confirm.
    cluster_assignments = emb_processor.get_cluster_assignments_for_slide(slide_name_HE, selected_features=selected_feats)
    idx = np.arange(len(WSI_object.coords))
    WSI_object.save_patches(
        os.path.join(output_dir, f"cluster_{cluster_idx}_geojson_{suffix}"),
        file_format="geojson",
        # Select the patches of `cluster_idx`, so the exported patches always match
        # the label, colour and folder name used below.
        selected_idx=idx[cluster_assignments == cluster_idx],
        merge=False,
        label=f"cluster_{cluster_idx}",
        color=emb_processor.cluster_colors[cluster_idx].tolist(),
        append_to_existing_file=True,
    )

# Comparison with EOT for clustering analysis

In [None]:
# Destination folder for the EOT comparison; rebinds `output_dir` once more.
output_dir = "../Results/BREAST_EOT"
# Create the folder (and any missing parents); no error if it already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Toggle: also use the "DAB" features (True) or only the "cells" features (False).
with_DAB = True
# Tag used to pick the cluster-model file loaded later in this section.
if with_DAB:
    suffix = "DAB"
else:
    suffix = "only"

In [None]:
# Collect, for every entry of the EOT registration folder, the first non-null
# "non_rigid_rTRE" value found in that slide's summary CSV.
reg_HE2IHC = "../Results/BREAST_EOT/reg_HE2IHC/"
slides, error_HE2IHC = [], []
for slide in os.listdir(reg_HE2IHC):
    slides.append(slide)
    summary_csv = os.path.join(reg_HE2IHC, slide, "data", f"{slide}_summary.csv")
    df_HE2IHC = pd.read_csv(summary_csv)
    rtre_values = df_HE2IHC["non_rigid_rTRE"].dropna().values
    error_HE2IHC.append(rtre_values[0])

In [None]:
# Registration error per EOT slide; display the ten largest errors.
df_reg_error = pd.DataFrame({"slide": slides, "error_HE2IHC": error_HE2IHC})
errors_desc = df_reg_error.sort_values(by="error_HE2IHC", ascending=False)
errors_desc.head(10)

In [None]:
# Baseline and end-of-treatment (EOT) cohort tables.
# NOTE(review): the following cell re-reads these same two files before merging,
# so this cell looks redundant — confirm before removing.
df_baseline = pd.read_csv("../materials/dataset_breast_final_v2.csv")
df_eot = pd.read_csv("../materials/dataset_breast_eot.csv")

In [None]:
df_baseline = pd.read_csv("../materials/dataset_breast_final_v2.csv")
df_eot = pd.read_csv("../materials/dataset_breast_eot.csv")

# Attach the baseline columns to the EOT rows via the specimen id, leaving out the
# baseline's own identifier and slide-id columns. This rebinds `df` to the merged table.
baseline_cols_to_drop = ["TdxNumber ", "ihc_slide_id", "he_slide_id"]
baseline_extra = df_baseline.loc[:, ~df_baseline.columns.isin(baseline_cols_to_drop)]
df = pd.merge(df_eot, baseline_extra, on="Additional Specimen ID", how="inner")

# Discard the listed TdxNumbers, then keep the H&E slide ids of the remaining rows.
tdx_to_remove = pd.read_csv("../materials/tdx_to_remove_breast_eot.csv")["tdx_to_remove"].values
slides_to_analyse = df[~df["TdxNumber"].isin(tdx_to_remove)]["he_slide_id"].values

# Feature files whose ".svs" counterpart is one of the slides to analyse.
feats_dir = "../Results/BREAST_EOT/feats_128_cell_stain_based_reg_warped_merged_HE/"
feats_files = [
    os.path.join(feats_dir, file)
    for file in os.listdir(feats_dir)
    if file.replace(".pt", ".svs") in slides_to_analyse
]
# feats_files is already restricted to the analysed slides, so no second filter is needed.
slide_names = [os.path.basename(path).split(".")[0] for path in feats_files]
emb_processor = ptb_emb.EmbeddingProcessor(
    feats_files,
    embeddings_names=os.path.join(feats_dir, "embeddings_names.csv"),
    slide_ids=slide_names,
    cmap="Set1",
)

In [None]:
# Keep the features whose name contains "cells", plus those containing "DAB"
# when with_DAB is set.
name_tokens = ("cells", "DAB") if with_DAB else ("cells",)
selected_feats = [
    name
    for name in emb_processor.embeddings_names
    if any(token in name for token in name_tokens)
]

In [None]:
# Reuse the cluster model saved by the "New clustering analysis" section.
# NOTE(review): with with_DAB=True this loads the "DAB" model file, whereas the
# section above, as written (with_DAB=False), only saves the "only" one — confirm
# the DAB model exists on disk before a fresh run.
baseline_model_path = os.path.join(
    "../Results/BREAST/clustering/New_cluster_analysis",
    f"cluster_model_cells_feats_{suffix}.pkl",
)
emb_processor.import_cluster_model(baseline_model_path)

In [None]:
# Per-slide cluster percentages for the EOT slides, on the selected features.
cluster_percentages = [
    emb_processor.get_cluster_percentages_for_slide(
        slide_id.split(".")[0],
        selected_features=selected_feats,
    )
    for slide_id in slides_to_analyse
]

# Binary response label for the rows of `df` matching the analysed slides.
# ORR values missing from the mapping become NaN.
response_groups = {
    "Complete response": "Responder",
    "Partial response": "Responder",
    "Progressive disease": "Non-responder",
    "Stable disease": "Non-responder",
}
eot_rows = df[df.he_slide_id.isin(slides_to_analyse)]
labels = eot_rows.ORR.map(response_groups).values
labels

In [None]:
# Wide table: one row per slide, one column per cluster, plus the response label.
df_plot = pd.DataFrame(cluster_percentages)
df_plot["Response to treatment"] = labels
# Scale every column except the trailing label column by 100.
cluster_cols = slice(None, -1)
df_plot.iloc[:, cluster_cols] *= 100

In [None]:
# Append the patient identifiers of the analysed rows to the percentages and export.
# The CSV is written to the current working directory, not to output_dir.
analysed_rows = df[df.he_slide_id.isin(slides_to_analyse)]
patient_info = analysed_rows[["TdxNumber", "Additional Specimen ID"]].reset_index(drop=True)
percentages_with_info = pd.concat([df_plot, patient_info], axis=1)
percentages_with_info.to_csv("cluster_percentages_with_dab_with_info_patients_EOT.csv", index=False)

# Cell staining distribution analysis

In [None]:
# Destination folder for the cell staining distribution outputs; rebinds `output_dir`.
output_dir = "../Results/BREAST/nuclei_IHC/Cell_staining_distribution"
# Create the folder (and any missing parents); no error if it already exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Binary response label for every row of `df`; ORR values missing from the mapping become NaN.
# NOTE(review): on a top-to-bottom run `df` is the merged EOT table built in the
# previous section, while this section reads from the BREAST (baseline) results
# folders — confirm which table is intended here.
labels = df.ORR.map({"Complete response": "Responder", "Partial response": "Responder", "Progressive disease": "Non-responder", "Stable disease": "Non-responder"}).values
labels

In [None]:
# Load the QuPath detection measurement tables (tab-separated), one per slide.
detection_dir = "../Results/BREAST/nuclei_IHC/QuPath/detection_measurements/"
# List the folder once so that `slides[i]` and `detections[i]` are guaranteed to refer
# to the same file; sort for a reproducible order and skip anything that is not a
# ".csv" file (e.g. checkpoint folders), which pd.read_csv could not parse.
detection_files = sorted(file for file in os.listdir(detection_dir) if file.endswith(".csv"))
slides = [file.replace(".csv", "") for file in detection_files]
detections = [pd.read_csv(os.path.join(detection_dir, file), sep="\t") for file in detection_files]

In [None]:
# Pool the mean cell DAB value of every detection across all slides, split the
# pooled values into two k-means clusters, and display the mean of the two centres.
dab_means_per_slide = [detection['DAB: Cell: Mean'].values for detection in detections]
values = np.concatenate(dab_means_per_slide)
# KMeans expects a 2-D input: one column, one row per detection.
kmeans = KMeans(n_clusters=2, random_state=0).fit(values.reshape(-1, 1))
kmeans.cluster_centers_.mean()

In [None]:
# Per slide: mean cytoplasm DAB variance over the detections whose mean cell DAB is
# above 0.14 and whose minimum cytoplasm DAB is positive.
# NOTE(review): 0.14 presumably comes from the k-means midpoint displayed by the
# previous cell — confirm.
features = {}
for slide_pos, detection in enumerate(detections):
    stained = detection["DAB: Cell: Mean"] > 0.14
    positive_cytoplasm = detection["DAB: Cytoplasm: Min"] > 0.
    features[slides[slide_pos]] = detection[stained & positive_cytoplasm]["DAB: Cytoplasm: Variance"].mean()

In [None]:
# Reorder the per-slide values to follow the row order of `df` (IHC ids without ".svs").
# A slide id with no entry in `features` raises KeyError.
# NOTE(review): `df` may be the merged EOT table at this point — confirm it is the intended one.
ihc_slide_ids = df["ihc_slide_id"].values
reorganised_features = [features[slide_id.replace(".svs", "")] for slide_id in ihc_slide_ids]

In [None]:
# One row per slide: the cytoplasm DAB variance feature and the response label.
# Rows with a missing value in either column are dropped before export.
df_plot = pd.DataFrame(
    {
        "cytoplasm_dab_avg_var": reorganised_features,
        "Response to treatment": labels,
    }
).dropna()
staining_csv = os.path.join(output_dir, "cytoplasm_dab_avg_var.csv")
df_plot.to_csv(staining_csv, index=False)

In [None]:
# Box plot of the cytoplasm DAB variance per response group, annotated with a
# Mann-Whitney test between the two groups, saved as a PNG.
x_col = "Response to treatment"
y_col = "cytoplasm_dab_avg_var"
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_plot, x=x_col, y=y_col)
ax.set_title('Boxplot of the average DAB variance measured in cytoplasm stratified  by response to treatment')
pairs = [("Non-responder", "Responder")]
annotator = Annotator(ax, pairs, data=df_plot, x=x_col, y=y_col)
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
annotator.apply_and_annotate()
figure_path = os.path.join(output_dir, "boxplot_cytoplasm_dab_avg_var.png")
plt.savefig(figure_path, dpi=300, bbox_inches="tight")