In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import hdbscan
from sklearn.cluster import DBSCAN

# Load the data: tab-separated matrix whose first column is used as the row index.
df = pd.read_csv("TCGA.BRCA.sampleMap_HiSeqV2_exon", sep="\t", index_col=0)
# Transpose so that the file's columns become the rows (observations) that get clustered.
X = df.T

# Impute missing values with the per-column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scale every feature to the [0, 1] range.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Project onto two principal components.
# The seed is fixed because scikit-learn's default solver selection may use a
# randomized SVD on large inputs; without a seed the projection (and every
# clustering result computed from it) can differ from run to run.
PCA_RANDOM_STATE = 42
pca = PCA(n_components=2, random_state=PCA_RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

# Distance metrics to try.
# NOTE(review): 'l1' and 'l2' produce the same recorded results as 'manhattan'
# and 'euclidean'; they look like aliases and double the run time - confirm
# whether they are needed.
metrics = ['euclidean', 'manhattan', 'chebyshev', 'cosine', 'l1', 'l2', 'canberra', 'braycurtis']

# Range of eps values to sweep for DBSCAN.
eps_values = np.arange(0.1, 10.1, 0.1)

# One dict per (model, metric, eps) run.
results = []


def _count_clusters(labels):
    """Return (number of clusters, number of noise points); label -1 marks noise."""
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = np.sum(labels == -1)
    return n_clusters, n_noise


def _silhouette_without_noise(points, labels, n_clusters):
    """Return the Euclidean silhouette of the non-noise points, or None if there are fewer than two clusters.

    NOTE(review): noise points are excluded, so a run that labels almost every
    sample as noise can still obtain a very high score. Read the score together
    with the noise count.
    """
    if n_clusters > 1:
        keep = labels != -1
        return silhouette_score(points[keep], labels[keep], metric='euclidean')
    return None


for metric in metrics:
    print(f"\n=== Métrica: {metric} ===")

    # HDBSCAN: a single run per metric (no eps parameter).
    print("[HDBSCAN]")
    try:
        clusterer_h = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=5, metric=metric)
        labels_h = clusterer_h.fit_predict(X_pca)
        n_clusters_h, noise_h = _count_clusters(labels_h)
        print(f"Clusters: {n_clusters_h}, Ruido: {noise_h}")
        score_h = _silhouette_without_noise(X_pca, labels_h, n_clusters_h)
        if score_h is not None:
            print(f"Silhouette: {score_h:.3f}")
        else:
            print("Silhouette no calculable")
        results.append({'model': 'HDBSCAN', 'metric': metric, 'eps': None, 'clusters': n_clusters_h, 'noise': noise_h, 'silhouette': score_h})
    except Exception as e:
        # Best effort: a metric the library rejects is reported and recorded as an empty row.
        print(f"HDBSCAN falló con métrica {metric}: {e}")
        results.append({'model': 'HDBSCAN', 'metric': metric, 'eps': None, 'clusters': None, 'noise': None, 'silhouette': None})

    # DBSCAN: one run per eps value.
    for eps in eps_values:
        try:
            db = DBSCAN(eps=eps, min_samples=5, metric=metric)
            labels_d = db.fit_predict(X_pca)
            n_clusters_d, noise_d = _count_clusters(labels_d)
            score_d = _silhouette_without_noise(X_pca, labels_d, n_clusters_d)
            results.append({
                'model': 'DBSCAN', 'metric': metric, 'eps': round(eps, 2),
                'clusters': n_clusters_d, 'noise': noise_d, 'silhouette': score_d
            })
        except Exception as e:
            print(f"DBSCAN falló con métrica {metric} y eps={eps:.2f}: {e}")
            results.append({
                'model': 'DBSCAN', 'metric': metric, 'eps': round(eps, 2),
                'clusters': None, 'noise': None, 'silhouette': None
            })

# Print one aligned line per run; missing values are shown as 'N/A'.
print("\n--- Resumen ---")
for r in results:
    model = r['model']
    metric = r['metric']
    eps = str(r['eps']) if r['eps'] is not None else 'N/A'
    clusters = r['clusters'] if r['clusters'] is not None else 'N/A'
    noise = r['noise'] if r['noise'] is not None else 'N/A'
    silhouette = f"{r['silhouette']:.3f}" if r['silhouette'] is not None else 'N/A'

    print(f"{model:>8} | Métrica: {metric:>10} | eps: {eps:>4} | Clusters: {clusters:>7} | Ruido: {noise:>4} | Silhouette: {silhouette}")



=== Métrica: euclidean ===
[HDBSCAN]
Clusters: 7, Ruido: 193




Silhouette: 0.270

=== Métrica: manhattan ===
[HDBSCAN]
Clusters: 4, Ruido: 213
Silhouette: 0.300





=== Métrica: chebyshev ===
[HDBSCAN]
Clusters: 9, Ruido: 307
Silhouette: 0.130





=== Métrica: cosine ===
[HDBSCAN]
HDBSCAN falló con métrica cosine: Unrecognized metric 'cosine'





=== Métrica: l1 ===
[HDBSCAN]
Clusters: 4, Ruido: 213
Silhouette: 0.300





=== Métrica: l2 ===
[HDBSCAN]
Clusters: 7, Ruido: 193
Silhouette: 0.270





=== Métrica: canberra ===
[HDBSCAN]
Clusters: 4, Ruido: 5
Silhouette: 0.276





=== Métrica: braycurtis ===
[HDBSCAN]
Clusters: 3, Ruido: 91
Silhouette: -0.204





--- Resumen ---
 HDBSCAN | Métrica:  euclidean | eps:  N/A | Clusters:       7 | Ruido:  193 | Silhouette: 0.270
  DBSCAN | Métrica:  euclidean | eps:  0.1 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.2 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.3 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.4 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.5 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.6 | Clusters:       0 | Ruido: 1218 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.7 | Clusters:       1 | Ruido: 1213 | Silhouette: N/A
  DBSCAN | Métrica:  euclidean | eps:  0.8 | Clusters:       3 | Ruido: 1201 | Silhouette: 0.888
  DBSCAN | Métrica:  euclidean | eps:  0.9 | Clusters:       5 | Ruido: 1191 | Silhouette: 0.898
  DBSCAN | Métrica:  euclid

In [None]:
# Highest silhouette values observed in the DBSCAN sweep
# (metric | eps | clusters | noise | silhouette):
#
#   l2        | 0.8 |  3 | 1201 | 0.888
#   l2        | 0.9 |  5 | 1191 | 0.898
#   manhattan | 0.9 |  2 | 1208 | 0.938
#   manhattan | 1.0 |  3 | 1201 | 0.888
#   manhattan | 1.1 |  3 | 1200 | 0.879
#   manhattan | 1.2 |  6 | 1184 | 0.864
#   chebyshev | 0.7 |  2 | 1208 | 0.902
#   chebyshev | 0.8 |  4 | 1192 | 0.810
#   chebyshev | 0.9 | 10 | 1161 | 0.804
#   l1        | 0.9 |  2 | 1208 | 0.938
#   l1        | 1.0 |  3 | 1201 | 0.888
#   l1        | 1.1 |  3 | 1200 | 0.879
#   l1        | 1.2 |  6 | 1184 | 0.864
#
# NOTE(review): in every row above 1161-1208 samples are labelled as noise,
# and the silhouette was computed on the remaining points only.

# Fit DBSCAN (manhattan) for the selected eps values.
selected_eps_values = [0.9, 1.0, 1.1, 1.2]  # more values can be added based on the sweep
models = {}  # eps -> label array

# Row labels of X, in the same order as the rows of X_pca.
patient_ids = X.index.tolist()

for eps in selected_eps_values:
    db = DBSCAN(eps=eps, min_samples=5, metric='manhattan')
    labels = db.fit_predict(X_pca)
    models[eps] = labels

    # Group patient IDs by cluster label, keeping first-appearance order.
    clusters = {}
    for pid, label in zip(patient_ids, labels):
        clusters.setdefault(label, []).append(pid)

    # Write the groups as plain text: a "<name> (<n> pacientes):" header, one ID
    # per line, then a blank line. The file keeps its .csv suffix but is NOT a
    # delimited table, so it must be parsed line by line.
    filename = f"dbscan_manhattan_eps{eps:.1f}_clusters.csv"
    with open(filename, "w") as f:
        for cluster_id, patients in clusters.items():
            cluster_name = f"Cluster {cluster_id}" if cluster_id != -1 else "Ruido"
            f.write(f"{cluster_name} ({len(patients)} pacientes):\n")
            f.writelines(f"{pid}\n" for pid in patients)
            f.write("\n")

    # Report real clusters only: the noise group (-1) is written to the file
    # but is not a cluster, which keeps this count consistent with the sweep.
    n_real_clusters = sum(1 for cluster_id in clusters if cluster_id != -1)
    print(f"Archivo guardado: {filename} con {n_real_clusters} clusters")


Archivo guardado: dbscan_manhattan_eps0.9_clusters.csv con 3 clusters
Archivo guardado: dbscan_manhattan_eps1.0_clusters.csv con 4 clusters
Archivo guardado: dbscan_manhattan_eps1.1_clusters.csv con 4 clusters
Archivo guardado: dbscan_manhattan_eps1.2_clusters.csv con 7 clusters


In [None]:
import pandas as pd

# Load the metadata spreadsheet.
metadata = pd.read_excel("MetaData.xlsx")

# Cluster file produced by the DBSCAN export step.
filename = "dbscan_manhattan_eps1.2_clusters.csv"

# Parse the grouped text file into {cluster label: [patient IDs]}.
# A line starting with "Ruido" opens the noise group (-1), a line starting
# with "Cluster" opens the group whose number is the second token, and every
# other non-blank line is an ID belonging to the group opened last.
clusters = {}
current_cluster = None
with open(filename, "r") as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("Ruido"):
            current_cluster = -1
        elif line.startswith("Cluster"):
            parts = line.split()
            current_cluster = int(parts[1])
        else:
            clusters[current_cluster].append(line)
            continue
        # Reached only for header lines: start an empty group.
        clusters[current_cluster] = []

# Función para convertir formato del ID en metadata al formato del cluster
def convertir_id_tcga(id_original):
    """Convert a dot-separated barcode into the hyphenated ID used in the cluster file.

    The first four dot-separated fields are kept and joined with "-"; the
    fourth field is cut to its first two characters, e.g.
    "TCGA.A2.A3XU.01A.12R.A22U.07" -> "TCGA-A2-A3XU-01".

    Args:
        id_original: Barcode string using "." as the field separator.
            Surrounding whitespace is ignored.

    Returns:
        The converted ID, or None when the value is not a string, has fewer
        than four fields, or one of the first four fields is empty.
    """
    # Missing values (None / NaN) have no barcode to convert.
    if not isinstance(id_original, str):
        return None
    partes = id_original.strip().split(".")
    # An empty field would yield a malformed ID such as "TCGA-A2-A3XU-".
    if len(partes) < 4 or not all(partes[:4]):
        return None
    return f"{partes[0]}-{partes[1]}-{partes[2]}-{partes[3][:2]}"

# Build the join key from the last metadata column, drop rows that could not
# be converted, and index the remaining rows by that key.
raw_ids = metadata.iloc[:, -1].astype(str)
metadata["converted_id"] = raw_ids.apply(convertir_id_tcga)
metadata = metadata.dropna(subset=["converted_id"])
id_to_metadata = metadata.set_index("converted_id").to_dict(orient="index")

# Walk every clustered ID once, collecting:
#   metadata_clusters - metadata records per cluster (matched IDs only)
#   all_cluster_ids   - every ID found in the cluster file, in file order
#   output_rows       - one row per ID, with metadata columns when available
metadata_clusters = {label: [] for label in clusters}
all_cluster_ids = []
output_rows = []

for cluster_id, ids in clusters.items():
    all_cluster_ids.extend(ids)
    for pid in ids:
        row_data = {"cluster_id": cluster_id, "paciente_id": pid}
        info = id_to_metadata.get(pid)
        if info is not None:
            row_data.update(info)
            metadata_clusters[cluster_id].append(info)
        output_rows.append(row_data)

# Compare the clustered IDs against the metadata keys.
metadata_ids = set(id_to_metadata)
matched = [pid for pid in all_cluster_ids if pid in metadata_ids]
unmatched = [pid for pid in all_cluster_ids if pid not in metadata_ids]

print(f"Total IDs en archivo de clústeres: {len(all_cluster_ids)}")
print(f"Coincidencias con metadata: {len(matched)}")
print(f"No encontrados: {len(unmatched)}")

# Print the matched metadata records cluster by cluster.
for cluster_id, pacientes in metadata_clusters.items():
    nombre_cluster = f"Cluster {cluster_id}" if cluster_id != -1 else "Ruido"
    print(f"\n--- {nombre_cluster} ({len(pacientes)} pacientes) ---")
    for paciente in pacientes:
        print(paciente)

# Assemble the per-patient table and write it to CSV without the index.
df_output = pd.DataFrame(output_rows)

output_filename = "metadata_por_cluster_eps1.2.csv"
df_output.to_csv(output_filename, index=False)

print(f"\n✅ Archivo exportado: {output_filename}")


Total IDs en archivo de clústeres: 1218
Coincidencias con metadata: 1184
No encontrados: 34

--- Ruido (1150 pacientes) ---
{'Patient ID': 'TCGA-A2-A3XU', 'Age at Diagnosis': 35, 'Sex': 'female', 'Race': 'Black or African American', 'Definition': 'Primary solid Tumor', 'Menopausal Status': 'postmenopausal', 'Cancer Type': 'IDC', 'ER': 'negative', 'PR': 'negative', 'HER2': 'not evaluated', 'Event': 'metastasis', 'Event Site': 'bone>breast recur|chest wall', 'Survival Status': 'dead', 'Survival Time (years)': 2.5, 'Subtype': 'Basal', 'Genetic Ancestry': 'African_admix', 'hidden': 'TCGA.A2.A3XU.01A.12R.A22U.07'}
{'Patient ID': 'TCGA-AR-A2LR', 'Age at Diagnosis': 49, 'Sex': 'female', 'Race': 'White', 'Definition': 'Primary solid Tumor', 'Menopausal Status': 'postmenopausal', 'Cancer Type': 'metaplastic carcinoma', 'ER': 'negative', 'PR': 'negative', 'HER2': 'negative', 'Event': nan, 'Event Site': nan, 'Survival Status': 'alive', 'Survival Time (years)': 2.32, 'Subtype': nan, 'Genetic Ances

In [None]:

# The cluster export is grouped plain text (a "<name> (<n> pacientes):" header
# followed by one ID per line), not a delimited table. pd.read_csv would take
# the first header as the column name and mix later headers in with the IDs,
# so the file is parsed line by line into one row per patient instead.
cluster_rows = []
current_label = None
with open("dbscan_manhattan_eps1.2_clusters.csv", "r") as fh:
    for raw in fh:
        text = raw.strip()
        if not text:
            continue
        if text.startswith(("Cluster", "Ruido")):
            # Header line: keep the group name, drop the " (<n> pacientes):" suffix.
            current_label = text.rsplit(" (", 1)[0]
        else:
            cluster_rows.append({"cluster": current_label, "paciente_id": text})

cluster_members = pd.DataFrame(cluster_rows, columns=["cluster", "paciente_id"])
# The original cell bound the loaded frame to `filename`; the name is kept so
# that any later cell reading it still finds a DataFrame.
filename = cluster_members

filename.head()

Unnamed: 0,Ruido (1184 pacientes):
0,TCGA-A2-A3XU-01
1,TCGA-AR-A2LR-01
2,TCGA-B6-A3ZX-01
3,TCGA-OL-A5D7-01
4,TCGA-E2-A570-01
