# 02. FM-GSEA: Pathway Analysis
## Foundation Model Gene Set Enrichment Analysis

**목적:**
- mTOR/glycolysis pathway 직접 증거 확보
- Yu et al. 2025 HBsAg/IL-15Rβ/mTOR axis 연결
- Tahoe-x1 embedding과 pathway 통합

**Key Pathways:**
1. mTOR signaling
2. Glycolysis
3. Oxidative phosphorylation
4. NK cell cytotoxicity
5. IL-15 signaling

---

## 1. Setup

In [None]:
# Mount Google Drive so the project folder below is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import sys
# Project folder on Drive. It is placed first on sys.path, presumably so the
# `itlas` package imported in later cells resolves from there — confirm layout.
PROJECT_ROOT = '/content/drive/MyDrive/ITLAS'
sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, spearmanr

# Scanpy logging verbosity level (see scanpy settings docs for what level 2 includes).
sc.settings.verbosity = 2

## 2. Load Data

In [None]:
# Load the dataset that already carries IT scores (the file saved by notebook 01).
data_path = f"{PROJECT_ROOT}/data/processed/GSE182159_with_IT_scores.h5ad"

try:
    adata = sc.read_h5ad(data_path)
    print(f"✓ Loaded: {adata.shape}")
except FileNotFoundError:
    print("⚠ Run 01_IT_Signature_Score.ipynb first!")
    print("  Or load raw data:")
    # Fallback: load every raw .h5ad file found under data/raw.
    import glob
    # Sorted so that the concatenation order is reproducible between runs.
    h5ad_paths = sorted(glob.glob(f"{PROJECT_ROOT}/data/raw/*.h5ad"))
    if not h5ad_paths:
        # Fail here with a clear message; otherwise `adata` stays undefined
        # (or stale from an earlier run) and later cells break confusingly.
        raise FileNotFoundError(
            f"No processed file at {data_path} and no raw .h5ad files in "
            f"{PROJECT_ROOT}/data/raw"
        ) from None
    adatas = [sc.read_h5ad(p) for p in h5ad_paths]
    adata = sc.concat(adatas) if len(adatas) > 1 else adatas[0]
    print(f"✓ Loaded raw data: {adata.shape}")

In [None]:
# Show how many cells fall into each disease stage.
print("Stage distribution:")
stage_counts = adata.obs['Stage'].value_counts()
print(stage_counts)

## 3. Pathway Score Calculation

In [None]:
from itlas.fm_gsea import PATHWAYS, calculate_pathway_scores, compare_pathway_by_stage

# For each pathway, report how many of its genes are present in this dataset
# together with the relevance note stored alongside it.
banner = "=" * 60
print(banner)
print("Available Pathways for HBV IT-immunopathogenesis")
print(banner)

for pw_name, pw_info in PATHWAYS.items():
    gene_list = pw_info['genes']
    n_present = sum(1 for gene in gene_list if gene in adata.var_names)
    print(f"\n{pw_name}:")
    print(f"  Genes: {n_present}/{len(gene_list)}")
    print(f"  Relevance: {pw_info['relevance']}")

In [None]:
# Compute the pathway score table (one column per pathway).
pathway_scores = calculate_pathway_scores(adata)

n_pathways = len(pathway_scores.columns)
print(f"\n✓ Calculated pathway scores for {n_pathways} pathways")
print(f"  Shape: {pathway_scores.shape}")

# Mirror every score column into adata.obs under a 'PW_' prefix.
# `.values` copies by position, so the row order must match adata.obs.
for pw_name in pathway_scores.columns:
    obs_key = f'PW_{pw_name}'
    adata.obs[obs_key] = pathway_scores[pw_name].values

## 4. Stage-wise Pathway Comparison

In [None]:
# Compare pathway activity between stages using the project helper.
comparison_df = compare_pathway_by_stage(adata, stage_col='Stage')

rule = "=" * 70
print("\nPathway Activity by Stage:")
print(rule)
display(comparison_df)

In [None]:
# Heatmap: pathway × stage
import os

from itlas.utils import STAGE_ORDER

# Mean pathway score per stage. The Stage column is attached to
# pathway_scores in place, so it is also present when that table is saved.
pathway_scores['Stage'] = adata.obs['Stage'].values
# observed=True: if Stage is categorical, categories without any cells are
# dropped instead of producing all-NaN rows in the heatmap.
stage_means = pathway_scores.groupby('Stage', observed=True).mean()

# Keep only the stages that are present, in the order given by STAGE_ORDER.
order = [s for s in STAGE_ORDER if s in stage_means.index]
stage_means = stage_means.loc[order]

# Z-score each pathway (column) across stages so pathways are comparable.
stage_means_z = (stage_means - stage_means.mean()) / stage_means.std()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    stage_means_z.T,  # transpose: pathways as rows, stages as columns
    cmap='RdBu_r',
    center=0,
    annot=True,
    fmt='.2f',
    cbar_kws={'label': 'Z-score'},
    ax=ax
)
ax.set_title('Pathway Activity Across Disease Stages\n(Z-score normalized)')
ax.set_xlabel('Stage')
ax.set_ylabel('Pathway')

plt.tight_layout()
# Make sure the output folder exists; savefig does not create it.
figure_dir = f"{PROJECT_ROOT}/results/figures"
os.makedirs(figure_dir, exist_ok=True)
fig.savefig(f"{figure_dir}/pathway_heatmap.png", dpi=150, bbox_inches='tight')
print("✓ Saved: results/figures/pathway_heatmap.png")
plt.show()

## 5. IT vs NL: Key Pathway Analysis

### Focus: mTOR, Glycolysis (Yu et al. 2025 연결)

In [None]:
# Detailed IT vs NL comparison for the key pathways (one panel per pathway).
import os

key_pathways = ['mTOR_signaling', 'glycolysis', 'oxidative_phosphorylation', 'nk_cell_cytotoxicity']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Loop-invariant pieces: stage labels and the IT/NL subset used by every boxplot.
stage_labels = adata.obs['Stage']
plot_data = adata.obs[stage_labels.isin(['IT', 'NL'])]

results = []
for ax, pathway in zip(axes.flat, key_pathways):
    col = f'PW_{pathway}'
    if col not in adata.obs.columns:
        # No score for this pathway: hide the panel rather than leave it blank.
        ax.set_visible(False)
        continue

    # IT vs NL
    it_data = adata.obs.loc[stage_labels == 'IT', col]
    nl_data = adata.obs.loc[stage_labels == 'NL', col]
    if it_data.empty or nl_data.empty:
        # The Mann-Whitney U test needs observations in both groups.
        print(f"⚠ Skipping {pathway}: no IT or NL cells")
        ax.set_visible(False)
        continue

    # Only the p-value is used; the U statistic is discarded.
    _, pval = mannwhitneyu(it_data, nl_data, alternative='two-sided')

    # Conventional significance stars for the results table.
    if pval < 0.001:
        significance = '***'
    elif pval < 0.01:
        significance = '**'
    elif pval < 0.05:
        significance = '*'
    else:
        significance = 'ns'

    it_mean = it_data.mean()
    nl_mean = nl_data.mean()
    results.append({
        'Pathway': pathway,
        'IT_mean': it_mean,
        'NL_mean': nl_mean,
        'Diff': it_mean - nl_mean,
        'p_value': pval,
        'Significant': significance
    })

    # Plot
    sns.boxplot(data=plot_data, x='Stage', y=col, order=['NL', 'IT'],
                palette={'NL': '#2ecc71', 'IT': '#3498db'}, ax=ax)
    ax.set_title(f"{pathway}\n(p={pval:.2e})")
    ax.set_ylabel('Pathway Score')

plt.suptitle('Key Pathway Comparison: IT vs NL', fontsize=14, y=1.02)
plt.tight_layout()
# Make sure the output folder exists; savefig does not create it.
figure_dir = f"{PROJECT_ROOT}/results/figures"
os.makedirs(figure_dir, exist_ok=True)
fig.savefig(f"{figure_dir}/IT_vs_NL_pathways.png", dpi=150, bbox_inches='tight')
plt.show()

# Results table
print("\nIT vs NL Pathway Statistics:")
display(pd.DataFrame(results))

## 6. Correlation: IT Signature vs Pathways

In [None]:
# Spearman correlation between the IT signature score and each pathway score.
import os

if 'IT_IT_signature' not in adata.obs.columns:
    # Say why nothing happens instead of skipping the analysis silently.
    print("⚠ 'IT_IT_signature' not found in adata.obs; skipping correlation analysis")
else:
    it_score = adata.obs['IT_IT_signature']
    correlations = []

    for pathway in PATHWAYS:
        col = f'PW_{pathway}'
        if col not in adata.obs.columns:
            continue
        rho, pval = spearmanr(it_score, adata.obs[col])
        correlations.append({
            'Pathway': pathway,
            'Spearman_rho': rho,
            'p_value': pval
        })

    if not correlations:
        # Sorting an empty frame by 'Spearman_rho' would raise KeyError.
        print("⚠ No pathway score columns (PW_*) found; nothing to correlate")
    else:
        corr_df = pd.DataFrame(correlations).sort_values('Spearman_rho', ascending=False)

        print("\nIT Signature Score ↔ Pathway Correlations:")
        print("=" * 50)
        display(corr_df)

        # Bar plot: red for positive, blue for zero or negative correlation.
        fig, ax = plt.subplots(figsize=(10, 5))
        colors = ['#e74c3c' if x > 0 else '#3498db' for x in corr_df['Spearman_rho']]
        ax.barh(corr_df['Pathway'], corr_df['Spearman_rho'], color=colors)
        ax.axvline(0, color='black', linewidth=0.5)
        ax.set_xlabel('Spearman Correlation (ρ)')
        ax.set_title('IT Signature Score ↔ Pathway Activity Correlation')
        plt.tight_layout()
        # Make sure the output folder exists; savefig does not create it.
        figure_dir = f"{PROJECT_ROOT}/results/figures"
        os.makedirs(figure_dir, exist_ok=True)
        # bbox_inches='tight' matches the other figures saved by this notebook.
        fig.savefig(f"{figure_dir}/IT_pathway_correlation.png", dpi=150, bbox_inches='tight')
        plt.show()

## 7. Tahoe-x1 Embedding Analysis (Optional)

**Note:** Embedding 추출에는 GPU가 필요합니다 (T4 이상 권장). 이미 저장된 embedding 파일을 로드하는 셀은 GPU 없이도 실행할 수 있습니다.

In [None]:
# Report whether CUDA is usable and, if so, the name of device 0.
import torch

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

In [None]:
# Install Tahoe-x1 if needed
# !pip install tahoe-x1  # or install from a git clone

# Embedding extraction is better done in a separate notebook
# (large datasets can run into memory problems).

notice_lines = (
    "⚠ Tahoe-x1 embedding extraction:",
    "  - 대용량 데이터는 batch 처리 필요",
    "  - 별도 노트북 (02b_Tahoe_Embedding.ipynb) 권장",
    "  - 이미 embedding이 있으면 아래 셀에서 로드",
)
print(*notice_lines, sep="\n")

In [None]:
# Load precomputed Tahoe embeddings when the file is present.
embedding_path = f"{PROJECT_ROOT}/data/embeddings/tahoe_embeddings.npy"

import os
if not os.path.exists(embedding_path):
    print(f"⚠ Embedding file not found: {embedding_path}")
    print("  Run Tahoe-x1 embedding extraction first")
else:
    embeddings = np.load(embedding_path)
    # Stored under obsm, so the array must have one row per cell in adata.
    adata.obsm['X_tahoe'] = embeddings
    print(f"✓ Loaded Tahoe embeddings: {embeddings.shape}")

    # Correlate embedding components with pathway scores (project helper).
    from itlas.fm_gsea import embedding_pathway_correlation
    corr_matrix = embedding_pathway_correlation(adata, n_components=10)

    fig, ax = plt.subplots(figsize=(12, 6))
    heatmap_kwargs = dict(cmap='RdBu_r', center=0, annot=True, fmt='.2f', ax=ax)
    sns.heatmap(corr_matrix, **heatmap_kwargs)
    ax.set_title('Tahoe-x1 Embedding Components ↔ Pathway Correlations')
    plt.tight_layout()
    plt.show()

## 8. Save Results

In [None]:
# Persist the tables and the annotated AnnData object.
import os

tables_dir = f"{PROJECT_ROOT}/results/tables"
processed_dir = f"{PROJECT_ROOT}/data/processed"
# Neither to_csv nor adata.write creates missing parent folders.
for out_dir in (tables_dir, processed_dir):
    os.makedirs(out_dir, exist_ok=True)

# Pathway score table (row index is written as the first column).
pathway_scores.to_csv(f"{tables_dir}/pathway_scores_by_cell.csv")
print("✓ Saved: results/tables/pathway_scores_by_cell.csv")

# Stage comparison table (without the row index).
comparison_df.to_csv(f"{tables_dir}/pathway_stage_comparison.csv", index=False)
print("✓ Saved: results/tables/pathway_stage_comparison.csv")

# AnnData including the PW_* columns added to adata.obs.
adata.write(f"{processed_dir}/GSE182159_with_pathways.h5ad")
print("✓ Saved: data/processed/GSE182159_with_pathways.h5ad")

---
## Summary

### Key Findings:
1. **mTOR signaling**: IT vs NL 차이 확인
2. **Glycolysis**: IT phase에서의 변화 분석
3. **OXPHOS**: Mito-high cluster와 연결
4. **NK cytotoxicity**: Cluster 15 collapse 지지

### Yu et al. 2025 연결:
- HBsAg → IL-15Rβ 결합 → mTOR 억제 → glycolysis 감소
- FM-GSEA 결과로 population-level 증거 제공

### Next Step:
- `03_Phase_Classifier.ipynb`: IT/IA/AR/AC 분류기 훈련