In [8]:
import os
import warnings
import yaml
import numpy as np
import pandas as pd
import sklearn
import optuna
import torch
import torch.nn as nn

from hdbscan import HDBSCAN
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler



In [None]:
file_path = "F:/졸업 후 연구/LatentGEE/Data"

dat = pd.read_csv(os.path.join(file_path, "insight.merged_otus.txt"), sep="\t", encoding = "utf-8")
dat_meta = pd.read_excel(os.path.join(file_path, "SupplementaryMaterial.xlsx"), header = 1, usecols = "B:F")
dat_seqID = dat_meta["SeqID"].astype(str).to_list()
dat_T = dat[dat_seqID].T
dat_T.reset_index(inplace = True)
dat_T.columns = dat_T.iloc[0].astype(str)
dat_T = dat_T.iloc[1:]
dat_T.columns.values[0] = "SeqID"
dat_merged = pd.merge(dat_T, dat_meta[["SeqID", "Study.1", "Study", "hivstatus", "Age"]], on = "SeqID", how = "inner")

X_tensor = dat_T.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")
X_tensor = torch.tensor(X_tensor.values, dtype = torch.float32)



In [18]:
dat_meta.head()

Unnamed: 0,Study,Study.1,SeqID,hivstatus,Age
0,Dillon,Dillion.2014,SRR1159399,0,27.0
1,Dillon,Dillion.2014,SRR1159402,1,48.0
2,Dillon,Dillion.2014,SRR1159405,1,25.0
3,Dillon,Dillion.2014,SRR1159408,0,29.0
4,Dillon,Dillion.2014,SRR1159411,1,58.0


In [17]:
dat.head()

Unnamed: 0,Resphera Insight (Raw Counts),4EP1,4EP10,4EP12,4EP15,4EP16,4EP17,4EP19,4EP20,4EP21,...,rgn.8.9,rgn.9.10,rgn.9.11,rgn.9.12,rgn.9.13,rgn.9.2,rgn.9.3,rgn.9.6,rgn.9.9,taxonomy
0,Abiotrophia_defectiva,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...
1,Acetatifactor_muris:Blautia_coccoides:Blautia_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,k__Bacteria; p__Firmicutes; c__Clostridia; o__...
2,Acetitomaculum_ruminis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,k__Bacteria; p__Firmicutes; c__Clostridia; o__...
3,Acetobacter_fabarum:Acetobacter_ghanensis:Acet...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,k__Bacteria; p__Proteobacteria; c__Alphaproteo...
4,Acetobacterium_carbinolicum:Acetobacterium_fim...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,k__Bacteria; p__Firmicutes; c__Clostridia; o__...


In [9]:
print("torch ==", getattr(torch, "__version__", None))
print("numpy ==", getattr(np, "__version__", None))
print("scikit-learn ==", getattr(sklearn, "__version__", None))
print("optuna ==", getattr(optuna, "__version__", None))
print("hdbscan ==", getattr(HDBSCAN, "__version__", None))

torch == 2.9.0.dev20250827+cu128
numpy == 2.1.2
scikit-learn == 1.7.1
optuna == 4.5.0
hdbscan == None


In [None]:

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
def get_dataset_for_cutoff(cutoff: float):
    """
    cutoff별 X_tensor를 CPU 텐서로 반환하고 input_dim을 리턴.
    1) 메모리 캐시 있으면 바로 사용
    2) 없으면 디스크 캐시(파일) 있나 확인 -> 로드
    3) 둘 다 없으면 원시데이터로부터 전처리 수행 -> 캐시/저장
    """
    key = f"zp_{cutoff:.4f}"
    if key in _DATASET_CACHE:
        X = _DATASET_CACHE[key]
        return X, X.shape[1]

    # (A) 디스크 캐시가 있다면:
    pkl_path = f".../preprocessed/hivrc_scene2_zp{cutoff:.2f}.pt"
    if os.path.exists(pkl_path):
        X = torch.load(pkl_path, map_location="cpu")
        _DATASET_CACHE[key] = X
        return X, X.shape[1]

    # (B) 원시데이터로부터 전처리 (여기서는 의사코드)
    # raw = load_raw(...)
    # X_np = preprocess_by_zero_prevalence(raw, cutoff=cutoff)  # np.ndarray (N,D)
    # X = torch.tensor(X_np, dtype=torch.float32)
    # torch.save(X, pkl_path)
    # _DATASET_CACHE[key] = X
    # return X, X.shape[1]
    raise FileNotFoundError(f"no dataset for cutoff={cutoff}; add preprocessing or cached file.")


# prevalence cutoff에 따른 zero 비율 계산 ---
def compute_zero_proportion_by_prevalence(otu_df, cutoffs):
    n_samples = otu_df.shape[0]
    proportions = []

    for cutoff in cutoffs:
        # prevalence 계산: 각 OTU의 nonzero 비율
        prevalence = (otu_df > 0).sum(axis=0) / n_samples

        # cutoff 미만인 OTU 제거
        filtered_df = otu_df.loc[:, prevalence >= cutoff]

        # 전체 zero 비율 계산
        zero_count = (filtered_df == 0).sum().sum()
        total_count = filtered_df.size
        zero_proportion = zero_count / total_count

        proportions.append(zero_proportion)

    return proportions

In [None]:
from latentgee.config import load_cfg, ModelConfig, TrainConfig, EvalConfig
from latentgee.pipeline import LatentGEEPipeline
from latentgee.core import train_vae

In [None]:
set_seed(42)

cfg = load_cfg("C:/Users/KOBIC/Documents/latentgee/experiments/config.yaml")

ss_model = cfg["search_space"]["model"]
ss_train = cfg["search_space"]["train"]
ss_cluster = cfg["search_space"]["clustering"]
ss_eval = cfg["search_space"]["eval"]


cutoff = cfg["data"]["zero_prevalence_cutoff"][0]
X_tensor, input_dit = get_dataset_for_cutoff(cutoff)

X = X_tensor.numpy().astype("float32")

y_batch = dat_meta["Study"].values
y_bio = dat_meta[["hivstatus", "Age"]].values

# ============================================================
# 4️⃣ Define Optuna Objective
# ============================================================
def objective(trial: optuna.Trial):
    
    # -------- Model search space --------
    latent_dim = trial.suggest_int("latent_dim", ss_model["latent_dim"][0], ss_model["latent_dim"][1])
    n_layers = trial.suggest_int("n_layers", ss_model["n_layers"][0], ss_model["n_layers"][1])
    base_dim = trial.suggest_categorical("base_dim", ss_model["base_dim"])
    dropout_rate = trial.suggest_float("dropout_rate", ss_model["dropout_rate"][0], ss_model["dropout_rate"][1])
    activation = trial.suggest_categorical("activation", ss_model["activation"])
    strategy = trial.suggest_categorical("strategy", ss_cluster["strategy"])
    
    # -------- Training search space --------
    epochs = trial.suggest_categorical("epochs", ss_train["epochs"])
    batch_size = trial.suggest_categorical("batch_size", ss_train["batch_size"])
    learning_rate = trial.suggest_float("learning_rate", ss_train[l])
    
    
    
    
    
    



