In [5]:
import os  # Per la gestione di file e directory
import torch  # Per il modello e i tensori
import torch.nn.functional as F  # Per interpolazione delle mappe di attenzione
from torchvision.transforms import Compose, Resize, ToTensor  # Preprocessamento delle immagini
from PIL import Image  # Per caricare immagini
import matplotlib.pyplot as plt  # Per visualizzare heatmap e risultati
import numpy as np  # Per manipolazioni numeriche
from sklearn.cluster import KMeans  # Per clustering delle feature
from conch.open_clip_custom import create_model_from_pretrained, tokenize, get_tokenizer
from pathlib import Path
import cv2
import json
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import utils
import shutil
import pandas as pd
import pickle


# show all jupyter output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using GPU')
else:
    device = torch.device('cpu')
    print('Using CPU')

# Set the random seed for reproducibility
torch.manual_seed(42)  # For CPU
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)  # For GPU

Using GPU


<torch._C.Generator at 0x147b6c1f83f0>

Import the pretrained model

In [6]:
#Import model

model, preprocess = create_model_from_pretrained('conch_ViT-B-16', "hf_hub:MahmoodLab/conch", hf_auth_token="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", device=device)
_ = model.eval()

Prepare the textual data

In [7]:
#load the classes and descriptors
classifier_folder="classifiers_tumor"
all_descriptors=json.load(open("classifiers/"+ classifier_folder + "/all_descriptors.json", 'r'))
tumor_classes=json.load(open("classifiers/"+ classifier_folder + "/tumor_classes.json", 'r'))
print(all_descriptors)


#build mapping from classes to id
class_index_file= json.load(open('classifiers/class_index.json','r'))
folder2id={v[0]:int(k) for k,v in class_index_file.items()}
print(folder2id)


['Irregular mucosal surface', 'Ulceration', 'Thickened esophageal wall', 'Loss of normal vascular pattern', 'White plaques or keratinization', 'Narrowed lumen', 'Asymmetric growth or mass', 'Hypervascularity', 'Increased contrast uptake in imaging', 'Infiltrative margins', 'Irregular glandular structures', 'Mucinous or necrotic areas', 'Thickened esophageal wall', 'Ulcerated or fungating mass', 'Loss of normal mucosal pattern', "Barrett's esophagus background", 'Hypervascularity', 'Narrowed or obstructed lumen', 'Heterogeneous contrast enhancement', 'Infiltrative or exophytic growth pattern']
{'Esophageal adenocarcinoma (ESAD)': 0, 'Esophageal squamous cell carcinoma (ESCC)': 1}


Compute the textual embeddings and the classes/descriptors similarity matrix 

In [8]:
#tokenize
tokenizer = get_tokenizer()
classes_tokens = tokenize(texts=tumor_classes, tokenizer=tokenizer).to(device)
descriptors_tokens= tokenize(texts=all_descriptors, tokenizer=tokenizer ).to(device)



#compute embeddings and similarity matrix, with the relevance between each class and descriptor
with torch.inference_mode():
    classes_embeddings = model.encode_text(classes_tokens)
    descriptors_embeddings = model.encode_text(descriptors_tokens)
    print("Classes embeddings shape:", classes_embeddings.shape)
    print("Descriptors embeddings shape:",descriptors_embeddings.shape)
    similarity_matrix=classes_embeddings @ descriptors_embeddings.T  # Shape: (num_classi, num_concetti)
    print("Sim Matrix shape:", similarity_matrix.shape)
   

Classes embeddings shape: torch.Size([2, 512])
Descriptors embeddings shape: torch.Size([20, 512])
Sim Matrix shape: torch.Size([2, 20])


Prepare the visual data

In [15]:
#load the patch embeddings in two different directories, according to their labels: 0-> ESAD, 1->ESCC

# file CSV
csv_path = "/homes/fmorandi/ai4bio_project/dataset_esca.csv"

# destination directories
esad_dir = "/homes/fmorandi/ai4bio_project/images/esad"
escc_dir = "/homes/fmorandi/ai4bio_project/images/escc"
'''
if not os.path.exists(esad_dir):
    os.makedirs(esad_dir)

if not os.path.exists(escc_dir):
    os.makedirs(escc_dir)

# load the dataset as dataframe
try:
    df = pd.read_csv(csv_path)
    print("CSV caricato con successo!")
except Exception as e:
    print(f"Errore nel caricamento del CSV: {e}")
    raise
print(df.columns)


if "slide" not in df.columns or "labels" not in df.columns:
    raise ValueError("Il CSV deve contenere le colonne 'filename' e 'label'")


for _, row in df.iterrows():
    file_name = row["slide"]
    label = row["labels"]
    
    
    src_path = file_name  # originale file path es: /work/h2020deciderficarra_shared/fmorandi/data/feats_conch_slide/TCGA-LN-A7HY-01Z-00-DX1_0.pkl

    
    # Determines right directory
    if label == 0:
        dest_path = os.path.join(esad_dir, os.path.basename(file_name))
    elif label == 1:
        dest_path = os.path.join(escc_dir, os.path.basename(file_name))
    else:
        continue
    
    # Copiees file in the right directory
    if os.path.exists(src_path):
        shutil.copy(src_path, dest_path)
        print(f"Copiato: {src_path} -> {dest_path}")
    else:
        print(f"File non trovato: {src_path}")
print("Copia completata!")

'''

'\nif not os.path.exists(esad_dir):\n    os.makedirs(esad_dir)\n\nif not os.path.exists(escc_dir):\n    os.makedirs(escc_dir)\n\n# load the dataset as dataframe\ntry:\n    df = pd.read_csv(csv_path)\n    print("CSV caricato con successo!")\nexcept Exception as e:\n    print(f"Errore nel caricamento del CSV: {e}")\n    raise\nprint(df.columns)\n\n\nif "slide" not in df.columns or "labels" not in df.columns:\n    raise ValueError("Il CSV deve contenere le colonne \'filename\' e \'label\'")\n\n\nfor _, row in df.iterrows():\n    file_name = row["slide"]\n    label = row["labels"]\n    \n    \n    src_path = file_name  # originale file path es: /work/h2020deciderficarra_shared/fmorandi/data/feats_conch_slide/TCGA-LN-A7HY-01Z-00-DX1_0.pkl\n\n    \n    # Determines right directory\n    if label == 0:\n        dest_path = os.path.join(esad_dir, os.path.basename(file_name))\n    elif label == 1:\n        dest_path = os.path.join(escc_dir, os.path.basename(file_name))\n    else:\n        contin

In [22]:
#see how an embedding is made
import pickle

pkl_path = "/homes/fmorandi/ai4bio_project/images/escc/TCGA-2H-A9GG-01Z-00-DX1_1.pkl"


with open(pkl_path, "rb") as f:
    data = pickle.load(f)

print(f"Tipo di dato caricato: {type(data)}")
print("Chiavi presenti nel dizionario:", data.keys())
print(data["region"].shape) #minore risoluzione ->usa questo -> shape (N variabile, 512)
print(data["patch"].shape) #maggiore risoluzione



Tipo di dato caricato: <class 'dict'>
Chiavi presenti nel dizionario: dict_keys(['region', 'patch'])
torch.Size([186, 512])
torch.Size([186, 8, 8, 512])
