# 01. IT Signature Score Analysis
## IT-exclusive marker 기반 scoring 시스템

**Î™©Ï†Å:**
- IT-exclusive clusters (21, 23, 25) ÎßàÏª§ Í∏∞Î∞ò scoring
- NK collapse (Cluster 15) Î∞òÏòÅ
- StageÎ≥Ñ validation

**Key IT Signatures:**
1. NK cell collapse (Log2OR=-5.15 in IT)
2. Mito-high populations (Clusters 21, 23)
3. B cell differentiation block (Cluster 25)

---

## 1. Setup

In [None]:
# Mount Google Drive so the project directory below is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import sys
PROJECT_ROOT = '/content/drive/MyDrive/ITLAS'
# Put the project root first on the import path so the `itlas` package can be
# imported. The membership check keeps re-runs of this cell from stacking
# duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

sc.settings.verbosity = 2
print(f"‚úì scanpy {sc.__version__}")

## 2. Load Data

In [None]:
# Upload the h5ad files (only needed the first time)
from google.colab import files
import os
import shutil

raw_dir = f"{PROJECT_ROOT}/data/raw"

# Create the raw-data directory up front: on a fresh project it does not exist
# yet, and moving the uploads into it would fail with FileNotFoundError.
os.makedirs(raw_dir, exist_ok=True)

# Look for files that were already uploaded
existing = os.listdir(raw_dir)
h5ad_files = [f for f in existing if f.endswith('.h5ad')]

if not h5ad_files:
    print("üìÇ GSE182159 h5ad ÌååÏùºÎì§ÏùÑ ÏóÖÎ°úÎìúÌïòÏÑ∏Ïöî")
    print("   (GSE182159_batch_0.h5ad ~ batch_4.h5ad)")
    uploaded = files.upload()

    # NOTE(review): assumes the uploaded files land in /content (the path the
    # original code used) — confirm the notebook's working directory.
    for filename in uploaded:
        shutil.move(f"/content/{filename}", f"{raw_dir}/{filename}")
        print(f"‚úì Moved: {filename}")
else:
    print(f"‚úì Found {len(h5ad_files)} h5ad files:")
    for f in h5ad_files:
        print(f"   - {f}")

In [None]:
# Load the h5ad files and merge them into a single AnnData object
import glob

h5ad_paths = sorted(glob.glob(f"{raw_dir}/*.h5ad"))
print(f"Loading {len(h5ad_paths)} files...")

# Fail early with an actionable message; without this guard an empty
# directory surfaces later as an opaque IndexError on adatas[0].
if not h5ad_paths:
    raise FileNotFoundError(
        f"No .h5ad files found in {raw_dir}; run the upload cell first."
    )

adatas = []
for path in h5ad_paths:
    # Use a separate name per batch so `adata` only ever refers to the final object.
    batch = sc.read_h5ad(path)
    print(f"  {os.path.basename(path)}: {batch.shape}")
    adatas.append(batch)

# Merge
if len(adatas) > 1:
    # join='outer' keeps the union of variables across the batches.
    adata = sc.concat(adatas, join='outer')
    print(f"\n‚úì Combined: {adata.shape}")
else:
    adata = adatas[0]
    print(f"\n‚úì Loaded: {adata.shape}")

adata

In [None]:
# Report the number of cells per Stage, or list the available obs columns
# when no 'Stage' column exists.
obs_columns = adata.obs.columns
if 'Stage' not in obs_columns:
    print("‚ö† 'Stage' column not found. Available columns:")
    print(obs_columns.tolist())
else:
    print("Stage distribution:")
    stage_counts = adata.obs['Stage'].value_counts()
    print(stage_counts)

## 3. Gene ID Mapping

In [None]:
# Inspect the gene-name format (ENSG ID vs. gene symbol)
print("Gene name format:")
leading_names = adata.var_names[:10].tolist()
print(leading_names)

# Names that are ENSG IDs need to be mapped to symbols; the decision is
# based on the first variable name only.
is_ensg = leading_names[0].startswith('ENSG')
print(f"\nUsing ENSG IDs: {is_ensg}")

In [None]:
# Gene ID mapping (only applied when var_names are ENSG IDs)
from itlas.utils import GENE_ID_MAP

if is_ensg:
    # Extra Ensembl ID -> symbol entries layered on top of the project map
    # (extend as needed).
    # NOTE: ENSG00000105372 (RPS19), ENSG00000101439 (CST3),
    # ENSG00000171848 (RRM2) and ENSG00000144476 (ACKR3) are different genes
    # and must NOT be used as keys for NACA / CST7 / SRGN / CXCR4; the keys
    # below are the Ensembl IDs of the named symbols.
    extended_map = {
        **GENE_ID_MAP,
        # IT-exclusive cluster markers
        'ENSG00000198899': 'MT-ATP6',
        'ENSG00000127528': 'KLF2',
        'ENSG00000251562': 'MALAT1',
        'ENSG00000212907': 'MT-ND4L',
        'ENSG00000198938': 'MT-CO3',
        'ENSG00000196531': 'NACA',
        # B cell markers
        'ENSG00000105369': 'CD79A',
        'ENSG00000156738': 'MS4A1',
        'ENSG00000204287': 'HLA-DRA',
        'ENSG00000019582': 'CD74',
        # NK markers
        'ENSG00000077984': 'CST7',
        'ENSG00000122862': 'SRGN',
        'ENSG00000121966': 'CXCR4',
        'ENSG00000153234': 'NR4A2',
        'ENSG00000158050': 'DUSP2',
    }

    # Apply the mapping; names without an entry are kept unchanged.
    original_ids = adata.var_names.tolist()
    new_names = [extended_map.get(n, n) for n in original_ids]
    adata.var['original_id'] = original_ids
    adata.var_names = new_names

    # Distinct IDs can map to the same symbol; duplicated var_names would make
    # name-based gene lookups ambiguous, so de-duplicate them.
    if not adata.var_names.is_unique:
        adata.var_names_make_unique()

    # Count only the names that actually changed, so entries that were never
    # ENSG IDs are not reported as mapped.
    mapped = sum(1 for old, new in zip(original_ids, new_names) if old != new)
    print(f"‚úì Mapped {mapped} genes to symbols")

## 4. IT Signature Score Calculation

In [None]:
from itlas.signature_score import (
    IT_EXCLUSIVE_MARKERS,
    NK_COLLAPSE_MARKERS,
    calculate_it_signature_score,
    validate_against_ground_truth
)

# Check which marker genes are present among the dataset's variable names
banner = "=" * 50
print(banner)
print("Checking marker gene availability")
print(banner)

available = adata.var_names

# One report per IT-exclusive cluster: how many of its marker genes exist.
for cluster in IT_EXCLUSIVE_MARKERS:
    info = IT_EXCLUSIVE_MARKERS[cluster]
    genes = info['genes']
    found = [gene for gene in genes if gene in available]
    print(f"\n{cluster} ({info['cell_type']}):")
    print(f"  Found: {len(found)}/{len(genes)} - {found}")

# Same report for the NK collapse marker set.
nk_genes = NK_COLLAPSE_MARKERS['genes']
nk_found = [gene for gene in nk_genes if gene in available]
print(f"\nNK collapse markers:")
print(f"  Found: {len(nk_found)}/{len(nk_genes)} - {nk_found}")

In [None]:
# Compute the IT signature score, requesting the per-component breakdown
scores = calculate_it_signature_score(adata, return_components=True)

# Store every returned score in adata.obs under an 'IT_' prefix
prefixed_scores = {f'IT_{name}': score for name, score in scores.items()}
for column, values in prefixed_scores.items():
    adata.obs[column] = values

print("‚úì IT Signature Scores calculated:")
for column in prefixed_scores:
    print(f"  - {column}")

## 5. Validation Against Known Stages

In [None]:
# Compare the IT score across Stages when Stage labels are available
if 'Stage' not in adata.obs.columns:
    print("‚ö† Stage column not available for validation")
else:
    validation = validate_against_ground_truth(adata, stage_col='Stage')
    print("\nIT Signature Score by Stage:")
    print("=" * 60)
    display(validation)

In [None]:
# Visualization: distribution of the IT scores per Stage
import os

from itlas.utils import STAGE_ORDER, STAGE_COLORS

if 'Stage' in adata.obs.columns:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    score_cols = ['IT_IT_signature', 'IT_cluster_21', 'IT_nk_collapse', 'IT_cluster_25']
    titles = ['Combined IT Score', 'Mito-high (Cl.21)', 'NK Collapse', 'B cell Block (Cl.25)']

    # Keep only the stages present in the data, in STAGE_ORDER order; stages
    # without a configured colour fall back to dark grey.
    present_stages = set(adata.obs['Stage'].unique())
    order = [s for s in STAGE_ORDER if s in present_stages]
    colors = [STAGE_COLORS.get(s, '#333') for s in order]

    for ax, col, title in zip(axes.flat, score_cols, titles):
        if col not in adata.obs.columns:
            # Hide the panel rather than leaving an empty, untitled axes.
            ax.set_visible(False)
            continue
        sns.boxplot(
            data=adata.obs, x='Stage', y=col,
            order=order, palette=colors, ax=ax
        )
        ax.set_title(title)
        ax.set_xlabel('')

    plt.suptitle('IT Signature Scores by Disease Stage', fontsize=14, y=1.02)
    plt.tight_layout()

    # Save; create the output directory first so savefig cannot fail with
    # FileNotFoundError on a fresh project tree.
    figures_dir = f"{PROJECT_ROOT}/results/figures"
    os.makedirs(figures_dir, exist_ok=True)
    fig.savefig(f"{figures_dir}/IT_signature_by_stage.png", dpi=150, bbox_inches='tight')
    print(f"‚úì Saved: results/figures/IT_signature_by_stage.png")
    plt.show()

In [None]:
# Statistical test: IT vs each other stage (one-sided Mann-Whitney U)
import os

from scipy.stats import mannwhitneyu, kruskal


def _significance_label(pval):
    """Return the star label for a p-value: '***', '**', '*' or 'ns'."""
    if pval < 0.001:
        return '***'
    if pval < 0.01:
        return '**'
    if pval < 0.05:
        return '*'
    return 'ns'


if 'Stage' in adata.obs.columns and 'IT' in adata.obs['Stage'].values:
    stage_labels = adata.obs['Stage']
    it_scores = adata.obs.loc[stage_labels == 'IT', 'IT_IT_signature']

    print("IT vs Other Stages (Mann-Whitney U test):")
    print("=" * 50)

    results = []
    for stage in ['NL', 'IA', 'AR', 'AC']:
        if stage not in stage_labels.values:
            continue
        other_scores = adata.obs.loc[stage_labels == stage, 'IT_IT_signature']
        # alternative='greater' tests whether IT scores tend to be larger.
        stat, pval = mannwhitneyu(it_scores, other_scores, alternative='greater')

        # NOTE(review): the per-stage key below yields one '<stage>_mean'
        # column per comparison (NaN elsewhere); kept as-is so the saved CSV
        # layout does not change.
        results.append({
            'Comparison': f'IT vs {stage}',
            'IT_mean': it_scores.mean(),
            f'{stage}_mean': other_scores.mean(),
            'U_statistic': stat,
            'p_value': pval,
            'Significant': _significance_label(pval)
        })

    results_df = pd.DataFrame(results)
    display(results_df)

    # Save; create the output directory first so to_csv cannot fail on a
    # fresh project tree.
    tables_dir = f"{PROJECT_ROOT}/results/tables"
    os.makedirs(tables_dir, exist_ok=True)
    results_df.to_csv(f"{tables_dir}/IT_score_statistics.csv", index=False)
    print(f"\n‚úì Saved: results/tables/IT_score_statistics.csv")

## 6. UMAP Visualization

In [None]:
# Compute a UMAP embedding only when adata.obsm has no 'X_umap' entry yet
umap_missing = 'X_umap' not in adata.obsm
if umap_missing:
    print("Computing UMAP...")
    # PCA -> neighbours -> UMAP, run in this order.
    sc.pp.pca(adata, n_comps=50)
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    print("‚úì UMAP computed")

In [None]:
# IT Signature Score on UMAP: Stage, combined score and NK collapse score
import os

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Stage panel. Guarded like the other cells, which all tolerate a missing
# 'Stage' column.
if 'Stage' in adata.obs.columns:
    # Build the palette for exactly the categories present, falling back to
    # dark grey for stages without a configured colour, as the other plots do.
    stage_categories = adata.obs['Stage'].astype('category').cat.categories
    stage_palette = {s: STAGE_COLORS.get(s, '#333') for s in stage_categories}
    sc.pl.umap(adata, color='Stage', ax=axes[0], show=False,
               palette=stage_palette, title='Disease Stage')
else:
    axes[0].set_visible(False)
    print("Stage column not available; skipping the Stage panel")

# IT Signature Score
sc.pl.umap(adata, color='IT_IT_signature', ax=axes[1], show=False,
           cmap='RdBu_r', title='IT Signature Score')

# NK Collapse Score
sc.pl.umap(adata, color='IT_nk_collapse', ax=axes[2], show=False,
           cmap='RdBu_r', title='NK Collapse Score')

plt.tight_layout()

# Create the output directory first so savefig cannot fail on a fresh
# project tree.
figures_dir = f"{PROJECT_ROOT}/results/figures"
os.makedirs(figures_dir, exist_ok=True)
fig.savefig(f"{figures_dir}/IT_signature_UMAP.png", dpi=150, bbox_inches='tight')
print(f"‚úì Saved: results/figures/IT_signature_UMAP.png")
plt.show()

## 7. IT-like Cell Classification

In [None]:
# Classify IT-like cells with a threshold on the IT signature score
import os

threshold = 0.5  # scores strictly above this value are labelled IT-like

adata.obs['IT_like'] = np.where(
    adata.obs['IT_IT_signature'] > threshold,
    'IT-like',
    'non-IT'
)

# IT-like proportion per Stage
if 'Stage' in adata.obs.columns:
    # normalize='index' makes each Stage row sum to 1; scale to percent.
    crosstab = pd.crosstab(
        adata.obs['Stage'],
        adata.obs['IT_like'],
        normalize='index'
    ) * 100
    # Guarantee both label columns exist: if no cell (or every cell) exceeds
    # the threshold, one label is absent and selecting 'IT-like' below would
    # raise KeyError.
    crosstab = crosstab.reindex(columns=['IT-like', 'non-IT'], fill_value=0.0)

    print("IT-like cell proportion by Stage (%):")
    print("=" * 40)
    display(crosstab.round(2))

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 5))
    order = [s for s in STAGE_ORDER if s in crosstab.index]
    bar_colors = [STAGE_COLORS.get(s, '#333') for s in order]
    crosstab.loc[order, 'IT-like'].plot(kind='bar', color=bar_colors, ax=ax)
    ax.set_ylabel('IT-like cells (%)')
    ax.set_title(f'IT-like Cell Proportion (threshold={threshold})')
    ax.set_xticklabels(order, rotation=0)

    plt.tight_layout()

    # Create the output directory first so savefig cannot fail on a fresh
    # project tree.
    figures_dir = f"{PROJECT_ROOT}/results/figures"
    os.makedirs(figures_dir, exist_ok=True)
    fig.savefig(f"{figures_dir}/IT_like_proportion.png", dpi=150)
    plt.show()

## 8. Save Results

In [None]:
# Save the AnnData object with the IT signature columns added
import os

output_path = f"{PROJECT_ROOT}/data/processed/GSE182159_with_IT_scores.h5ad"
# Create the target directory first; writing into a missing directory fails.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
adata.write(output_path)
print(f"‚úì Saved: {output_path}")
print(f"  Shape: {adata.shape}")
print(f"  New columns: IT_IT_signature, IT_cluster_21, IT_cluster_23, IT_cluster_25, IT_nk_collapse, IT_like")

---
## Summary

### Key Results:
1. **IT Signature Score** calculated for all cells
2. **Validation**: IT phase shows highest IT signature score
3. **IT-like cells** classified based on threshold

### Next Steps:
- `02_FM_GSEA.ipynb`: Pathway analysis (mTOR, glycolysis)
- `03_Phase_Classifier.ipynb`: ML-based phase prediction