This notebook merges the `.pickle` files produced by the feature extraction model, pairing each sATAC file with its Visium counterpart.

---
# 0. - Imports and paths

In [2]:
import os
import pandas as pd
import matplotlib.font_manager as fm
%matplotlib inline
import matplotlib.pyplot as plt
import pickle
import numpy as np
import re
from collections import defaultdict

In [3]:
# Work from the step-4 folder so that the relative paths used below ("../fonts/...", "../3_features_extraction/...") resolve.
# NOTE(review): hard-coded, machine-specific absolute path — must be adapted on any other machine.
# NOTE(review): the recorded output shows a different prefix than the path passed here; presumably a symlink resolved by os.getcwd() or a stale output — confirm.
os.chdir("/disk2/user/gabgam/work/gigi_env/the_project/4_clustering_and_classification/")
print(os.getcwd())

/disk2/work/gabgam/gigi_env/the_project/4_clustering_and_classification


In [4]:
# The DM Sans font files can be downloaded from https://fonts.google.com/specimen/DM+Sans
# (or https://github.com/google/fonts/blob/main/ofl/dmsans/DMSans%5Bopsz%2Cwght%5D.ttf).
font_path = "../fonts/static/DMSans-Medium.ttf"

# Register the font with matplotlib and make it the default family.
# The font is purely cosmetic, so a failure must not stop the notebook.
# A missing or unreadable file raises an OSError (e.g. FileNotFoundError) rather
# than a RuntimeError, so both are caught and reported.
try:
    fm.fontManager.addfont(font_path)
    plt.rcParams['font.family'] = 'DM Sans'
except (OSError, RuntimeError) as e:
    print(f"Failed to load font: {e}")

# sns.set_theme(font_scale=1.5, style="whitegrid")
# biomodal_palette = ["#9CDBD9", "#C0DF16","#003B49","#05868E", "#ABAD9A", "#F87C56","#00DAEF","#50B37B"]
# sns.set_palette(biomodal_palette)

In [5]:
# # Test plot
# plt.figure(figsize=(6, 4))
# plt.title("Test Plot with DM Sans Font μ", fontsize=16)
# plt.plot([1, 2, 3], [4, 6, 8])
# plt.show()

In [6]:
# Name of the feature-extraction model; it selects the sub-folder used in both the input and output paths below.
model = "kimianet"

In [7]:
# Folder containing the extracted-feature `.pickle` files to merge.
# The commented-out assignment is an alternative dataset pair; keep exactly one of the two active.
# PATH_TO_SAVED_FEATURES = f"../3_features_extraction/output/{model}/satac_C1_v3_allspots_&_visium_2022_FF_WG_10X_img_not_changed_allspots/" # must end with "/"
PATH_TO_SAVED_FEATURES = f"../3_features_extraction/output/{model}/satac_C1_v3_allspots_&_visium_FFPE_dcis_idc_10X_img_not_changed_allspots/" # must end with "/"

In [8]:
# Mirror the name of the input features folder under this step's own output folder.
# Fail fast on a malformed path: carrying on would leave `path_to_paired_pickles`
# undefined (or stale from an earlier run) when `os.makedirs` is reached.
if not PATH_TO_SAVED_FEATURES.endswith("/"):
    raise ValueError("Adapt the path string to the correct format (must end with \"/\").")

# With a trailing "/", the last element of the split is empty, so [-2] is the folder name.
path_to_paired_pickles = f"output/{model}/{PATH_TO_SAVED_FEATURES.split('/')[-2]}/"
print(path_to_paired_pickles)

os.makedirs(path_to_paired_pickles, exist_ok=True)

output/kimianet/satac_C1_v3_allspots_&_visium_FFPE_dcis_idc_10X_img_not_changed_allspots/


In [9]:
# path to the target image
PATH_TO_REFERENCE = "../2_image_normalisation/reference_images/reference_full.jpeg"
# `TARGET_IS_<filename>`
TARGET_IS = "target_is_reference_full"

# 1. - Merging the extracted features per normalisation method

First, let's list the `.pickle` files in the saved-features folder (only the files whose name ends with `_width1000.pickle` are kept).

In [10]:
# Keep only the files whose name ends with "_width1000.pickle"
# (use ".pickle" as the suffix instead to keep every pickle in the folder).
# `os.listdir` returns entries in arbitrary order, so the list is sorted to get a
# reproducible printout and a deterministic processing order downstream.
all_pickles = sorted(i for i in os.listdir(PATH_TO_SAVED_FEATURES) if i.endswith("_width1000.pickle"))

print(f"{len(all_pickles)} .pickle files inside the folder.\n")
print(all_pickles)

16 .pickle files inside the folder.

['satac_C1_&_v3_allspots_&_target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_100um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_100um_histomicsTK_macenko_nomasking_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_100um_stainNET_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_68um_ORIGINAL WSI_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_68um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_68um_histomicsTK_macenko_nomasking_width1000.pickle', 'satac_C1_&_v3_allspots_&_target_is_reference_full_68um_stainNET_width1000.pickle', 'visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots_&_target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle', 'visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots_&_targe

## 1.1 - Merging (sATAC + Visium ST)

In [11]:
# NOTE(review): project-local helper module, presumably importable only once the
# working directory has been set — hence imported here rather than in the top import cell; confirm.
from utils_clust_n_class import load_pickle_as_df, save_pickle

# Maps (tile size, normalisation method) -> {dataset: filename}.
grouped_files = defaultdict(dict)

# Captures the dataset ("satac" or "visium"), the tile size (e.g. "100um") and the
# normalisation method from a file name that carries the `TARGET_IS` tag.
# `re.escape` keeps the tag literal even if it ever contains regex metacharacters,
# and `$` pins ".pickle" to the end of the name.
pattern = re.compile(rf'(satac|visium).*?_{re.escape(TARGET_IS)}_(\d+um)_(.*)\.pickle$')

for file in all_pickles:
    match = pattern.search(file)
    if not match:
        # Report the file instead of dropping it silently.
        print(f"Skipping (name does not match the expected pattern): {file}")
        continue

    dataset, tile_size, method = match.groups()
    previous = grouped_files[(tile_size, method)].get(dataset)
    if previous is not None:
        # Two files map to the same slot: the later one wins, but say so.
        print(f"WARNING: {previous} is replaced by {file} for ({tile_size}, {method}, {dataset}).")
    grouped_files[(tile_size, method)][dataset] = file


# Merge every complete (satac, visium) pair.
for (tile_size, method), group in grouped_files.items():
    if 'satac' not in group or 'visium' not in group:
        print(f"Skipping incomplete pair for ({tile_size}, {method}): only {sorted(group)} found.\n")
        continue

    satac_file = group['satac']
    visium_file = group['visium']

    print(f"Processing pair:\nSATAC: {satac_file}\nVISIUM: {visium_file}\n")

    # Load both files and transpose them, so that concatenating along axis 0
    # stacks the entries of the two datasets one below the other.
    # NOTE(review): presumably rows are spots and columns are features after `.T`;
    # columns present in only one dataset would be NaN-filled by `pd.concat` — confirm.
    satac_df = load_pickle_as_df(os.path.join(PATH_TO_SAVED_FEATURES, satac_file)).T
    visium_df = load_pickle_as_df(os.path.join(PATH_TO_SAVED_FEATURES, visium_file)).T

    merged_df = pd.concat([satac_df, visium_df], axis=0)

    # File names are "_&_"-separated: the first two fields identify the dataset,
    # the last one holds the target tag, tile size and normalisation method.
    satac_parts = os.path.splitext(satac_file)[0].split("_&_")
    visium_parts = os.path.splitext(visium_file)[0].split("_&_")
    satac_base = "_&_".join(satac_parts[:2])
    visium_base = "_&_".join(visium_parts[:2])
    normalisation_method = satac_parts[-1]

    MERGED_NAME = f"MERGED_{satac_base}__&__{visium_base}__for__{normalisation_method}.pickle"

    output_file = os.path.join(path_to_paired_pickles, MERGED_NAME)

    save_pickle(merged_df, output_file)
    print(f"Saved merged file: {output_file}\n")

  from .autonotebook import tqdm as notebook_tqdm


Processing pair:
SATAC: satac_C1_&_v3_allspots_&_target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle
VISIUM: visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots_&_target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle

Saved merged file: output/kimianet/satac_C1_v3_allspots_&_visium_FFPE_dcis_idc_10X_img_not_changed_allspots/MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle

Processing pair:
SATAC: satac_C1_&_v3_allspots_&_target_is_reference_full_100um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle
VISIUM: visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots_&_target_is_reference_full_100um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle

Saved merged file: output/kimianet/satac_C1_v3_allspots_&_visium_FFPE_dcis_idc_10X_img_not_changed_allspots/MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_

Let's list the names of the merged files that were written to the output folder.

In [12]:
# Show the files now present in the merged-output folder.
print(os.listdir(path_to_paired_pickles))

['MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_100um_ORIGINAL WSI_width1000.pickle', 'MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_100um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle', 'MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_100um_histomicsTK_macenko_nomasking_width1000.pickle', 'MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_100um_stainNET_width1000.pickle', 'MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_68um_ORIGINAL WSI_width1000.pickle', 'MERGED_satac_C1_&_v3_allspots__&__visium_FFPE_dcis_idc_10X_&_img_not_changed_allspots__for__target_is_reference_full_68um_fromWSI_histomicsTK_macenko_nomasking_width1000.pickle', 'M