In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# imported project modules (src.*) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys, os
import pandas as pd
# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve (presumably the repository root when the
# notebook is run from its own folder -- confirm the launch directory).
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.loading import DataLoader
from src.cleaning import DataCleaner
from src.weighting import DataWeighter
from src.similarity_gpu import NetworkBuilder
from src.clustering import NetworkClusterer

In [3]:
# Stage 1 - loading.
# DataLoader.load_and_prepare_data() hands back seven objects: four annotation
# matrices (BP, CC, MF, HPO) and three depth tables (DepthBP, DepthCC, DepthMF).
# NOTE(review): BP/CC/MF look like the three Gene Ontology namespaces and HPO
# like the Human Phenotype Ontology -- confirm against src.loading.
dl = DataLoader()
(
    BP, CC, MF, HPO,
    DepthBP, DepthCC, DepthMF,
) = dl.load_and_prepare_data()

Loading DATA from ../data/raw/
[OK] BP data loaded: (5183, 9873)
[OK] CC data loaded: (5183, 1478)
[OK] MF data loaded: (5183, 3258)
[OK] HPO data loaded: (5183, 10185)
[OK] DepthBP data loaded: (1, 9566)
[OK] DepthCC data loaded: (1, 1453)
[OK] DepthMF data loaded: (1, 3157)


In [4]:
# Stage 2 - cleaning.
# NOTE(review): the cleaned frames are bound to the SAME names as the raw
# inputs, so the raw data is no longer reachable after this cell, and
# re-running this cell on its own feeds already-cleaned data back into
# DataCleaner. Re-run the loading cell first when iterating here.
dc = DataCleaner(
    BP, CC, MF, HPO,
    DepthBP, DepthCC, DepthMF,
)
(
    BP, CC, MF, HPO,
    DepthBP, DepthCC, DepthMF,
) = dc.clean_all()

[OK] BP data cleaned: (4153, 6115)
[OK] CC data cleaned: (4153, 880)
[OK] MF data cleaned: (4153, 1335)
[OK] HPO data cleaned: (4153, 6342)
[OK] DepthBP data cleaned: (1, 6013)
[OK] DepthCC data cleaned: (1, 867)
[OK] DepthMF data cleaned: (1, 1315)


In [5]:
# Stage 3 - weighting.
# DataWeighter receives the four annotation matrices plus the three depth
# tables and returns one transformed matrix per annotation source.
# Per the recorded output, BP/CC/MF get depth weighting followed by TF-IDF,
# while HPO (no depth table is passed for it) gets plain TF-IDF.
dw = DataWeighter(
    BP, CC, MF, HPO,
    DepthBP, DepthCC, DepthMF,
)
(
    BP_tfidf,
    CC_tfidf,
    MF_tfidf,
    HPO_tfidf,
) = dw.transform_all()

=== Depth-Weighted TF-IDF ===

[BP] Applying depth weighting...
  [DEPTH] 102/6115 terms missing from depth file -> imputed with median depth (5.0)
[BP] Applying TF-IDF...
[OK] BP transformed: (4153, 6115)

[CC] Applying depth weighting...
  [DEPTH] 13/880 terms missing from depth file -> imputed with median depth (4.0)
[CC] Applying TF-IDF...
[OK] CC transformed: (4153, 880)

[MF] Applying depth weighting...
  [DEPTH] 20/1335 terms missing from depth file -> imputed with median depth (4.0)
[MF] Applying TF-IDF...
[OK] MF transformed: (4153, 1335)

[HPO] No depth file available — applying plain TF-IDF...
[OK] HPO transformed: (4153, 6342)

[SAVED] All TF-IDF matrices written to ../data/processed/


In [6]:
# Stage 4 - similarity network fusion.
# The four TF-IDF matrices are handed to NetworkBuilder, and build_and_fuse()
# returns a single fused network (bound to `fused`).
# NOTE(review): K, the iteration count and the compute device are not set
# here; the recorded output shows K=20, t=20 on device "mps", presumably
# defaults chosen inside src.similarity_gpu -- confirm.
snf_builder = NetworkBuilder(
    BP_tfidf,
    CC_tfidf,
    MF_tfidf,
    HPO_tfidf,
)
fused = snf_builder.build_and_fuse()

=== Similarity Network Fusion (device: mps) ===

[BP] Feature matrix: (4153, 6115)
[CC] Feature matrix: (4153, 880)
[MF] Feature matrix: (4153, 1335)
[HPO] Feature matrix: (4153, 6342)

[SNF] Fusing 4 networks (K=20, t=20 iterations)...
[OK] Fused network: (4153, 4153)
[SAVED] ../data/processed/fused_network.csv


In [7]:
# Stage 5 - dimensionality reduction + clustering on the fused network.
# All tunables are collected in one mapping so they are easy to spot and
# change; they are forwarded unchanged as keyword arguments.
# NOTE(review): n_components / n_neighbors presumably configure UMAP and
# min_cluster_size / min_samples / cluster_selection_method HDBSCAN --
# confirm against src.clustering.
# NOTE(review): no random seed is passed from this cell; UMAP is stochastic,
# so verify that NetworkClusterer fixes one if reproducible clusters matter.
nc = NetworkClusterer(fused)
clustering_options = {
    'dim_method': 'umap',
    'clus_method': 'hdbscan',
    'n_components': 2,
    'n_neighbors': 5,
    'min_cluster_size': 4,
    'min_samples': 3,
    'cluster_selection_method': 'eom',
}
clusters_df = nc.run_and_save(**clustering_options)

=== Inizio Fase di Clustering ===
[Riduzione] Metodo: UMAP | Componenti: 2


  warn("using precomputed metric; inverse_transform will be unavailable")
  warn(


[Clustering] Metodo: HDBSCAN
[OK] Clustering completato: 370 moduli identificati.
=== Validazione ===
Silhouette Score: 0.6700 (escluso rumore)

[SAVED] Risultati esportati in: ../data/results/gene_clusters_final.csv
